# ThreadedCategoryScraper.py
import urllib.request

from bs4 import BeautifulSoup, SoupStrainer


def scrape_category_page(url):
    global ALL_TEXT, non_bmp_map, threads, count
    # Parse only <div> tags (and their children) to keep the soup small.
    soup = BeautifulSoup(urllib.request.urlopen(url), 'lxml',
                         parse_only=SoupStrainer('div'))
    # Accounts for categories with over 200 pages: category listings are
    # paginated, so follow the "next page" link in a separate thread.
    link = soup.find('a', href=True, string='next page')
    if link is not None:
        try:
            t = catThread('https://en.wikipedia.org' + link['href'])
            t.daemon = True
            t.start()
            threads.append(t)
        except RuntimeError:
            print("Error: Unable to thread.")
    # Sends the links of the Wikipedia articles in this category to be scraped.
    pages_in_category = soup.find('div', {'id': 'mw-pages'}).find(
        'div', {'class': 'mw-category'})
    for obj in pages_in_category.find_all('a'):
        tempbun = scrape(Bundle('https://en.wikipedia.org' + obj['href'], False))
        with lock:
            ALL_TEXT += tempbun.text.translate(non_bmp_map)
            print(count)
            count += 1
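

# --- Sketch (assumption): catThread, Bundle, scrape, and lock are referenced
# --- above but defined elsewhere in the original file. Minimal stand-ins
# --- could look like the following; treat these as illustrations, not the
# --- author's actual definitions.
import threading

lock = threading.Lock()  # guards the shared ALL_TEXT/count globals


class catThread(threading.Thread):
    """Worker that scrapes the next page of a paginated category listing."""

    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        scrape_category_page(self.url)


class Bundle:
    """Pairs an article URL with a flag (the flag's meaning is assumed here)."""

    def __init__(self, url, flag):
        self.url = url
        self.flag = flag
        self.text = ''


def scrape(bundle):
    # Stand-in article scraper: fetch the page and join its paragraph text.
    html = urllib.request.urlopen(bundle.url)
    page = BeautifulSoup(html, 'lxml')
    bundle.text = ' '.join(p.get_text() for p in page.find_all('p'))
    return bundle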
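
# --- Usage sketch (assumption): initialize the shared globals and scrape one
# --- category; the category URL below is a hypothetical example. Pagination
# --- threads are joined before the accumulated text is read.
import sys

if __name__ == '__main__':
    ALL_TEXT = ''
    # Map non-BMP code points (e.g. emoji) to U+FFFD so translate() can
    # neutralize characters some consoles cannot print.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xFFFD)
    threads = []
    count = 0
    scrape_category_page('https://en.wikipedia.org/wiki/Category:Machine_learning')
    for t in threads:
        t.join()
    print(len(ALL_TEXT), 'characters scraped')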