# ThreadedCategoryScraper.py
import urllib.request

from bs4 import BeautifulSoup, SoupStrainer


def scrape_category_page(url):
    global ALL_TEXT, non_bmp_map, threads, count
    # Parse only <div> tags (and their children) to keep the soup small.
    soup = BeautifulSoup(urllib.request.urlopen(url), 'lxml',
                         parse_only=SoupStrainer('div'))
    # Accounts for categories with over 200 pages: category listings are
    # paginated, so follow the "next page" link in a separate thread.
    link = soup.find('a', href=True, string='next page')
    if link is not None:
        try:
            t = catThread('https://en.wikipedia.org' + link['href'])
            t.daemon = True
            t.start()
            threads.append(t)
        except RuntimeError:
            print("Error: Unable to thread.")
    # Sends the links of the Wikipedia articles in this category to be scraped.
    pages_in_category = soup.find('div', {'id': 'mw-pages'}).find(
        'div', {'class': 'mw-category'})
    for obj in pages_in_category.find_all('a'):
        tempbun = scrape(Bundle('https://en.wikipedia.org' + obj['href'], False))
        with lock:
            ALL_TEXT += tempbun.text.translate(non_bmp_map)
            print(count)
            count += 1
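

# --- Sketch (assumption): catThread, Bundle, scrape, and lock are referenced
# --- above but defined elsewhere in the original file. Minimal stand-ins
# --- could look like the following; treat these as illustrations, not the
# --- author's actual definitions.
import threading

lock = threading.Lock()  # guards the shared ALL_TEXT/count globals


class catThread(threading.Thread):
    """Worker that scrapes the next page of a paginated category listing."""

    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        scrape_category_page(self.url)


class Bundle:
    """Pairs an article URL with a flag (the flag's meaning is assumed here)."""

    def __init__(self, url, flag):
        self.url = url
        self.flag = flag
        self.text = ''


def scrape(bundle):
    # Stand-in article scraper: fetch the page and join its paragraph text.
    html = urllib.request.urlopen(bundle.url)
    page = BeautifulSoup(html, 'lxml')
    bundle.text = ' '.join(p.get_text() for p in page.find_all('p'))
    return bundle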
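
# --- Usage sketch (assumption): initialize the shared globals and scrape one
# --- category; the category URL below is a hypothetical example. Pagination
# --- threads are joined before the accumulated text is read.
import sys

if __name__ == '__main__':
    ALL_TEXT = ''
    # Map non-BMP code points (e.g. emoji) to U+FFFD so translate() can
    # neutralize characters some consoles cannot print.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xFFFD)
    threads = []
    count = 0
    scrape_category_page('https://en.wikipedia.org/wiki/Category:Machine_learning')
    for t in threads:
        t.join()
    print(len(ALL_TEXT), 'characters scraped')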