def parse_venues(venue_types, line_index=16):
    """Collect cleaned venue names from the saved HTML listing pages.

    For every entry of ``venue_types`` the folder
    ``config.DATA + "venues/html/<venue_type>/"`` is listed, and from each
    file a single line is parsed as an HTML fragment.  The text of every
    ``<a>`` under an ``<li>`` inside ``<div id="browse-<venue_type>-output">``
    is passed through the cleaning function registered for that venue type.

    Args:
        venue_types: Iterable of venue type keys.  Only 'conf' and
            'journals' have a cleaning function registered; any other key
            raises ``KeyError`` as soon as a file yields a matching link.
        line_index: 0-based index of the line that holds the listing markup.
            Defaults to 16, the position that used to be hard-coded.

    Returns:
        A list of ``(name, venue_type)`` tuples, in the order the links were
        encountered.
    """
    methods = {'conf': clean_conf,
               'journals': clean_journal}
    venues = []

    for venue_type in venue_types:
        folder = config.DATA + ("venues/html/%s/" % venue_type)
        query = "//div[@id='browse-%s-output']//li/a" % venue_type
        print("\nProcessing folder '%s'" % folder)

        for file_name in os.listdir(folder):
            print(" '%s'" % file_name)
            with open(os.path.join(folder, file_name), 'r') as handle:
                lines = handle.readlines()

            # Only one line of the page carries the listing.  A file that is
            # too short to contain it is reported and skipped so that a
            # single stray file does not abort the whole run.
            try:
                listing_line = lines[line_index]
            except IndexError:
                print(" skipped '%s': no line at index %d"
                      % (file_name, line_index))
                continue

            # Parse just that line as an HTML fragment.
            html = lxml.html.fromstring(listing_line)
            links = html.xpath(query)
            if links:
                # Looked up only when there is something to clean, so an
                # unregistered venue type fails exactly where it used to.
                clean_name = methods[venue_type]
                venues.extend((clean_name(link.text_content()), venue_type)
                              for link in links)

    print("%d venues." % len(venues))
    return venues
评论列表
文章目录