google.py 文件源码-python代码片段

google.py 文件源码

python

阅读 19 收藏 0 点赞 0 评论 0

项目：script.module.metadatautils 作者: marcelveldt 项目源码文件源码

def get_data(self, search_query):
    '''
    Helper method to get data from google images by scraping and parsing.

    Performs a Google image search for ``search_query`` and collects the
    image urls found in the links of the ``<div id="images">`` container.

    :param search_query: text to search for on Google Images
    :return: list of image url strings, in page order; empty list if the
        request fails or no image links are found
    '''
    params = {"site": "imghp", "tbm": "isch", "tbs": "isz:l", "q": search_query}
    # Use implicit string concatenation instead of a backslash inside the
    # literal, which would embed the next line's indentation in the header.
    # NOTE(review): presumably this legacy mobile user-agent is what makes
    # Google return the plain-HTML result page parsed below - confirm.
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; '
                             'IEMobile/7.0; LG; GW910)'}
    try:
        html = requests.get('https://www.google.com/search', headers=headers, params=params, timeout=5).text
    except Exception as exc:
        # best-effort lookup: log the failure and report no results
        log_exception(__name__, exc)
        return []
    soup = BeautifulSoup.BeautifulSoup(html)
    results = []
    for div in soup.findAll('div', {'id': 'images'}):
        for a_link in div.findAll('a'):
            href = a_link.get('href')
            # skip anchors without a href and links that do not carry an
            # image url (otherwise the whole href would be returned)
            if not href or 'imgurl=' not in href:
                continue
            # the image url sits between 'imgurl=' and '&imgrefurl='
            img = href.split('imgurl=')[-1].split('&imgrefurl=')[0]
            if img:
                results.append(img)
    return results