obtain_date.py 文件源码-python代码片段

obtain_date.py 文件源码

python

阅读 35 收藏 0 点赞 0 评论 0

项目：Broad_Crawler 作者: rafacheng 项目源码文件源码

def pre_process(dates):
    # \u5e74\u6708\u65e5
    # dates = [re.subn(r'[\u4e00-\u5e73]|[\u5e75-\u6707]', '', tm)[0] for tm in dates]
    # dates = [re.subn(r'[\u6709-\u65e4]|[\u65e6-\u9fa5]', '', tm)[0] for tm in dates]
    # ???
    dates = dates[:int(len(dates) / 2)]
    # ?????100??
    dates = [d for d in dates if 100 > len(d) > 0]
    # ????????
    dates = [d.replace('\n', ' ').replace('\t', ' ').replace('\b', ' ').replace('&nbsp;', ' ') for d in dates]
    # ????????????????
    new_dates = []
    for v in dates:
        v = re.sub('\A\s*', '', v)
        v = re.sub('\s*\Z', '', v)
        v = re.sub('\s+', ' ', v)
        new_dates.append(v)
    # ???????????????????????
    shorter_dates = []
    for v in new_dates:
        ffdi = find_first_digit_index(v)
        if ffdi:
            shorter_dates.append(v[ffdi-1:])
    return shorter_dates