import re
import urllib.request

from bs4 import BeautifulSoup, SoupStrainer


def scrape(bun):
    ### open the URL as a file-like object
    link = urllib.request.urlopen(bun.URL)
    ### the flag decides whether we need the full page (to reach the
    ### category links) or can parse only the <p> tags
    if bun.categories:
        soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml')
    else:
        p_tags = SoupStrainer('p')
        soup = BeautifulSoup(link.read().decode('utf-8'), 'lxml', parse_only=p_tags)
    ### dictionary of paragraph texts, keyed by token
    doc = {}
    ### token and counter used to replace paragraphs in the HTML
    token = 'Waka'
    count = 0
    ### all the paragraph texts concatenated into one string
    alltxt = ''
    ### iterate through the <p> tags
    for para in soup.find_all('p'):
        ### store the raw text in the dictionary
        doc[token + str(count)] = para.get_text()
        alltxt = alltxt + para.get_text() + ' '
        ### replace the <p> contents with its token
        para.string = token + str(count)
        count += 1
    ### collect the list of category links
    cats = []
    if bun.categories:
        for cat in soup.find('div', {'id': 'catlinks'}).find('ul').find_all('li'):
            cats.append('https://en.wikipedia.org' + cat.find('a')['href'])
    ### point relative stylesheet and script URLs back at Wikipedia
    for css in soup.find_all('link', rel='stylesheet'):
        css['href'] = '//en.wikipedia.org/' + css['href']
    for js in soup.find_all('script', src=re.compile('.*')):
        js['src'] = '//en.wikipedia.org/' + js['src']
    ### update the Bundle with everything we scraped
    bun.paragraphs = doc
    bun.text = alltxt
    bun.html = soup.encode('ascii', 'xmlcharrefreplace').decode('ascii')
    bun.categories = cats
    return bun
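
### Usage sketch: the code above never defines the Bundle object that
### scrape() takes, so the minimal class below is an assumption inferred
### from the attributes the function reads and writes (URL, categories,
### paragraphs, text, html); the real definition may differ.
class Bundle:
    def __init__(self, url, categories=False):
        self.URL = url
        ### True -> parse the full page and collect category links;
        ### note scrape() overwrites this flag with the list of URLs
        self.categories = categories
        self.paragraphs = {}
        self.text = ''
        self.html = ''

if __name__ == '__main__':
    ### the article URL here is only an example
    bun = scrape(Bundle('https://en.wikipedia.org/wiki/Web_scraping', categories=True))
    print(len(bun.paragraphs), 'paragraphs,', len(bun.categories), 'category links')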