def pre_process(dates):
    """Clean raw date strings and trim each one to start near its first digit.

    Steps, in order:
      1. Keep only the first half of ``dates``.
      2. Drop empty strings and strings of 100 characters or more
         (length is measured on the raw, uncleaned string).
      3. Turn backspace characters into spaces, collapse every run of
         whitespace into a single space, and strip leading/trailing
         whitespace.
      4. Keep only the strings for which ``find_first_digit_index`` returns
         a truthy value, sliced from one position before that value.

    Args:
        dates: Sequence of strings (must support slicing). Not modified.

    Returns:
        A new list with the cleaned and trimmed strings, in input order.
    """
    # NOTE: an earlier, now-disabled step removed CJK characters other than
    # \u5e74 / \u6708 / \u65e5 (year / month / day) before any other work.

    # Only the first half of the input is processed.
    first_half = dates[:len(dates) // 2]

    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    whitespace_run = re.compile(r'\s+')

    trimmed = []
    for raw in first_half:
        if not 0 < len(raw) < 100:
            continue

        # Backspace is not matched by \s, so map it to a space explicitly;
        # newlines and tabs are handled by the whitespace collapse below.
        text = whitespace_run.sub(' ', raw.replace('\b', ' '))
        # After collapsing, any leading/trailing whitespace is a single space.
        text = text.strip(' ')

        digit_pos = find_first_digit_index(text)
        # NOTE(review): this treats a falsy result as "no digit found" and
        # slices from digit_pos - 1. Presumably the helper returns a 1-based
        # position (0/None when absent) -- confirm; if it is 0-based, strings
        # that start with a digit are silently dropped here.
        if digit_pos:
            trimmed.append(text[digit_pos - 1:])
    return trimmed
评论列表
文章目录