def get_ranking(html_source, date, category):
    """Parse an App Annie rankings HTML page and bulk-insert ranking rows.

    Args:
        html_source: Raw HTML of the rankings page (str or bytes).
        date: Date stamped on every created row.
        category: Category label for the rows (truncated to 200 chars).

    Side effects:
        Creates AppAnnieRankings rows via ``objects.bulk_create``.
        Fields that fail to parse are stored as the int sentinel ``0``,
        matching the original behavior.
    """
    # Hoist regexes out of the per-row loops (compiled once per call).
    app_id_re = re.compile(r'ios/(.+)/')
    seller_id_re = re.compile(r'(company|publisher)/(.+)/')

    html = lxml.html.fromstring(html_source)
    ranking_obj = []
    # Table columns 2-4 hold the free / paid / grossing charts respectively.
    for col, rank_type in [(2, 'free'), (3, 'paid'), (4, 'grossing')]:
        app_names = []
        app_urls = []
        seller_names = []
        seller_ids = []
        store_app_ids = []

        # Evaluate the app-name query once (original ran it twice: once for
        # the rank count, once for the names).
        name_spans = html.xpath(
            "//tr/td[%d]//*[contains(@class, 'app-name')]/span" % col)
        ranks = range(1, len(name_spans) + 1)
        for span in name_spans:
            # span.text can be None for empty cells -> TypeError on slicing.
            try:
                app_names.append(span.text[:150])
            except TypeError:
                app_names.append(0)

        # We can use these urls to get the missing app_ids later (if needed).
        for link in html.xpath(
                "//tr/td[%d]//*[contains(@class, 'app-name')]" % col):
            try:
                app_urls.append(link.attrib['href'])
            except KeyError:
                app_urls.append(0)

        for img in html.xpath('//tr/td[%d]/div/div/a/img' % col):
            # The store app id is embedded in the icon url: .../ios/<id>/...
            # .get() also covers a missing src attribute (the original would
            # have raised KeyError before its try block).
            match = app_id_re.search(img.attrib.get('src', ''))
            store_app_ids.append(match.group(1) if match else 0)

        for span in html.xpath(
                "//tr/td[%d]//*[contains(@class, 'publisher-name')]/span" % col):
            try:
                seller_names.append(span.text[:150])
            except TypeError:
                seller_names.append(0)

        for link in html.xpath(
                "//tr/td[%d]//*[contains(@class, 'publisher-name')]" % col):
            match = seller_id_re.search(link.attrib.get('href', ''))
            seller_ids.append(match.group(2) if match else 0)

        for rank, store_app_id, app_name, seller_id, seller_name, app_url in zip(
                ranks, store_app_ids, app_names, seller_ids, seller_names, app_urls):
            ranking_obj.append(
                AppAnnieRankings(
                    store_app_id=store_app_id,
                    # Sentinel 0 is an int; slicing it raised TypeError in
                    # the original. Only truncate real strings.
                    app_name=app_name[:200] if isinstance(app_name, str) else app_name,
                    rank_type=rank_type,
                    category=category[:200],
                    seller_id=seller_id,
                    seller_name=seller_name[:200] if isinstance(seller_name, str) else seller_name,
                    app_url=app_url,
                    rank=rank,
                    date=date))

    AppAnnieRankings.objects.bulk_create(ranking_obj, batch_size=10000)
# NOTE(review): removed stray blog-page navigation text that was pasted in
# here ("评论列表" = "comment list", "文章目录" = "article table of contents");
# as bare identifiers they would raise NameError at import time.