def main_craw_ptt(i,ptt_class_name,sql_name,bo):
    """Crawl one index page of a PTT board and process every article on it.

    Parameters
    ----------
    i : int
        Index-page number to fetch (e.g. 4806); appended to the board URL.
    ptt_class_name : str
        PTT board name, e.g. 'Soft_Job'.
    sql_name : str
        SQL table/DB identifier passed through to the helper functions.
    bo : str
        Mode flag: 'new' crawls only articles newer than the newest
        timestamp already stored in SQL; 'his' crawls everything.

    Side effects: prints progress/error markers; delegates per-article
    storage to ``craw_ptt_data_fun``.
    """
    index_name = 'http://www.ptt.cc'
    index_class = '/bbs/' + ptt_class_name + '/index'
    index_url = index_name + index_class + str(i) + '.html'

    res = requests.get(index_url, verify=True)
    soup = BeautifulSoup(res.text, "lxml")
    entries = soup.find_all("", {'class': 'r-ent'})

    # Reuse one HTTP session for all article fetches on this page instead of
    # building a new requests.session() per article.
    session = requests.session()

    # BUGFIX: the original loop reused `i` as the loop index, shadowing the
    # page-number parameter; a distinct name keeps the parameter intact.
    for idx, entry in enumerate(entries):
        link = entry.find('a')
        if link is None:
            # Entry has no <a> tag (typically a deleted article) — skip it.
            print('error')
            continue

        article_url = index_name + link['href']  # make the relative href absolute
        title = link.get_text()

        response = session.get(article_url)
        if response.status_code == 404:
            print(404)
        elif re.search('[??]', title):
            # Skip titles matching this pattern. NOTE(review): the original
            # pattern was mojibake ('??'); as written it matches a literal
            # '?' in the title — confirm the intended characters.
            print('[??]')
        elif response.status_code == 200:
            if bo == 'new':
                # Incremental mode: lower bound is the max timestamp in SQL.
                max_date_time = catch_ptt_history_date_time(ptt_class_name, sql_name)
            elif bo == 'his':
                # History mode: no lower bound on article dates.
                max_date_time = 0
            tem = craw_ptt_data_fun(article_url, entries, idx, index_url,
                                    sql_name, max_date_time, bo)
        else:
            print('other')
#---------------------------------------------------------------------------------
# NOTE(review): the comment below was mojibake in the original; it appears to
# say that while crawling data up to around index=100, page 5 raised an error
# and the crawl was resumed from page 6 — confirm against project history.
# (Following two lines were stray scraped text, translated and commented out
# so the module parses.)
# Comment list
# Table of contents