import re

import requests
from bs4 import BeautifulSoup


def fix_data(i, ptt_class_name, sql_name, bo, j):
    # e.g. ptt_class_name = 'Soft_Job', i = 4806, j = 18
    index_name = 'http://www.ptt.cc'
    index_class = '/bbs/' + ptt_class_name + '/index'
    index_url = index_name + index_class + str(i) + '.html'
    res = requests.get(index_url, verify=True)
    soup = BeautifulSoup(res.text, "lxml")
    # each article entry on the index page is a div with class "r-ent"
    temp = soup.find_all('div', {'class': 'r-ent'})
    # start from position j so entries that were already processed can be skipped
    for pos in range(j, len(temp)):
        link = temp[pos].find('a')
        if link is None:
            # entries without a link (e.g. deleted articles) cannot be crawled
            print('error')
        else:
            article_url = index_name + link['href']
            title = link.get_text()
            # e.g. article_url = 'https://www.ptt.cc/bbs/Soft_Job/M.1503652456.A.526.html'
            response = requests.session().get(article_url)
            if response.status_code == 404:
                print(404)
            elif re.search('[??]', title):
                # skip articles whose title matches this filter pattern
                print('[??]')
            elif response.status_code == 200:
                if bo == 'new':
                    # only crawl articles newer than the latest one already stored
                    date_time = catch_ptt_history_date_time(ptt_class_name, sql_name)
                    max_date_time = max(date_time)
                elif bo == 'his':
                    # full-history crawl: no lower bound on the article date
                    max_date_time = 0
                craw_ptt_data_fun(article_url, temp, pos, index_url,
                                  sql_name, max_date_time, bo)
            else:
                print('other')
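
# A minimal usage sketch for fix_data (assumptions: the helper functions
# catch_ptt_history_date_time and craw_ptt_data_fun are defined elsewhere in
# this post, and the board/table names below are only example values).
if __name__ == '__main__':
    page_index = 4806     # PTT index page number, i.e. .../Soft_Job/index4806.html
    board = 'Soft_Job'    # PTT board name
    table = 'soft_job'    # SQL table the crawled articles are written to (example name)
    mode = 'new'          # 'new': only articles newer than those stored; 'his': full history
    start_pos = 0         # first r-ent entry on the page to process
    fix_data(page_index, board, table, mode, start_pos)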
#---------------------------------------------------------------------------------