def guba_sina(show_content=False):
    """
    Scrape the Sina guba page and return its posts as a DataFrame.

    The page at ``nv.GUBA_SINA_URL`` is downloaded and parsed. The first
    ``div.tit_04`` headline and every ``ul.list_05`` list item each become
    one row: the link text is the title, and the link target is passed to
    ``_guba_content`` to obtain the remaining column values.

    Parameter
    --------
        show_content: whether to keep the ``content`` column, default False.
            The check is ``show_content is True``, so only the literal
            ``True`` keeps the column; other truthy values drop it.

    Return
    --------
    DataFrame (columns named by ``nv.GUBA_SINA_COLS``)
        title, text of the post's link
        content, post content (only when show_content=True)
        ptime, presumably the publish time -- TODO confirm in _guba_content
        rcounts, presumably the read count (cast to float) -- TODO confirm

    If any exception is raised along the way, its message is printed and
    None is returned instead of a DataFrame.
    """
    from pandas.io.common import urlopen
    try:
        page_url = nv.GUBA_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['sina'])
        with urlopen(page_url) as resp:
            raw = resp.read()
        doc = lxml.html.document_fromstring(raw)
        list_items = doc.xpath('//ul[@class=\"list_05\"]/li')
        headlines = doc.xpath('//div[@class=\"tit_04\"]')

        # Only the first headline is used; its link is the sole <a> child,
        # whereas list items carry the post link in their second <a>.
        targets = [(node, 'a/text()', 'a/@href') for node in headlines[:1]]
        targets += [(node, 'a[2]/text()', 'a[2]/@href')
                    for node in list_items]

        records = []
        for node, title_path, href_path in targets:
            title = node.xpath(title_path)[0]
            link = node.xpath(href_path)[0]
            # _guba_content supplies every column after the title.
            records.append([title] + list(_guba_content(link)))

        frame = pd.DataFrame(records, columns=nv.GUBA_SINA_COLS)
        frame['rcounts'] = frame['rcounts'].astype(float)
        if show_content is True:
            return frame
        return frame.drop('content', axis=1)
    except Exception as er:
        # Best-effort scraper: report the failure and fall through to None.
        print(str(er))
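# 评论列表 (comment list) -- web-page navigation residue, not part of the code
# 文章目录 (article table of contents) -- web-page navigation residue, not part of the code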