def scrape_all_posts_unflat(url: str, verbose: bool, cache: bool) -> List[List['Post']]:
    """Scrape every page of a thread, returning one list of Posts per page."""
    unflat_posts = []
    fget = requests.get if not cache else memory.cache(requests.get)
    page = fget(url).text  # The first page ends up being downloaded twice (again by scrape_posts below).
    # ^ we could scrape_page(page) directly, append the result, and use [urls - url], but KISS.
    n_of_pages = pq(page).find('.pagejump > a').eq(0).text().strip().split(' ')[-1]  # Gets '10' from 'Page 1 of 10'
    # If there is only one page, the page-jump element is empty.
    if n_of_pages == '':
        urls = [url]
    else:
        url_prefix_match = re.match('(.*)(page-[0-9]+)', url)
        url_prefix = url if url_prefix_match is None else url_prefix_match.group(1)
        if url_prefix[-1] != '/':
            url_prefix += '/'
        urls = [url_prefix + 'page-' + str(n + 1) for n in range(int(n_of_pages))]
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        fscrape = scrape_posts if not cache else memory.cache(scrape_posts, ignore=['verbose'])
        futures = [executor.submit(fscrape, url, verbose) for url in urls]
        results, _ = concurrent.futures.wait(futures)
        for result in results:
            unflat_posts.append(result.result())
    return unflat_posts
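
# Minimal usage sketch (illustrative only: the URL below is hypothetical, and
# `memory`, `scrape_posts`, and `Post` are assumed to be defined elsewhere in this module):
if __name__ == '__main__':
    pages = scrape_all_posts_unflat('https://example.com/threads/some-thread/',
                                    verbose=True, cache=False)
    posts = [post for page_posts in pages for post in page_posts]  # flatten List[List[Post]] -> List[Post]
    print('Scraped', len(posts), 'posts across', len(pages), 'pages')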