def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it is absent."""
    # str.strip/lstrip treat their argument as a set of characters, which
    # would also eat trailing "c"/"h"/"s" characters of the identifier.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


def standardize_act_target(tgt_url):
    """Produce a label and normalized URLs for an identified Act target.

    The label is derived from the URL rather than from the link text, which
    avoids having to account for typos and inconsistencies in the text.

    The URL is split on "/". Segment 3 must be a ``YYYY-MM-DD`` date;
    segment 4 (optional) is read as the chapter, conventionally prefixed
    with "ch"; segment 5 (optional) is read as the section, conventionally
    prefixed with "s".

    Args:
        tgt_url: Slash-separated URL/path of the Act target.

    Returns:
        A tuple ``(tgt_title, tgt_url, tgt_url_broad)``:
        * ``tgt_title`` -- "Act of <Month> <day>, <year>" followed by the
          chapter and section when they are present.
        * ``tgt_url`` -- the input with the "ch" prefix removed from
          segment 4 and the "s" prefix removed from segment 5.
        * ``tgt_url_broad`` -- the first six segments of the normalized URL.

    Raises:
        IndexError: if the URL has fewer than four segments, the date
            segment has fewer than three "-"-separated parts, or the month
            number is out of range.
        ValueError: if the month part of the date is not an integer.
    """
    segments = tgt_url.split("/")
    date_parts = segments[3].split("-")
    date_label = "{} {}, {}".format(
        calendar.month_name[int(date_parts[1])], date_parts[2], date_parts[0]
    )

    chapter = segments[4] if len(segments) > 4 else None
    section = segments[5] if len(segments) > 5 else None

    if chapter is not None and section is not None:
        tgt_title = "Act of {}, ch. {} {}".format(
            date_label, _strip_prefix(chapter, "ch"), _strip_prefix(section, "s")
        )
    elif chapter is not None and chapter.startswith("ch"):
        tgt_title = "Act of {}, ch. {}".format(
            date_label, _strip_prefix(chapter, "ch")
        )
    elif chapter is not None and chapter.startswith("s"):
        # Only a section follows the date (no chapter segment).
        tgt_title = "Act of {}, {}".format(date_label, _strip_prefix(chapter, "s"))
    else:
        # No chapter/section at all, or a segment without a recognized
        # prefix: fall back to the date-only label instead of leaving the
        # title unassigned.
        tgt_title = "Act of {}".format(date_label)

    # Normalize the URL in place: only the "ch" prefix is removed from
    # segment 4 and only the "s" prefix from segment 5.
    if chapter is not None:
        segments[4] = _strip_prefix(chapter, "ch")
    if section is not None:
        segments[5] = _strip_prefix(section, "s")

    tgt_url = "/".join(segments)
    # Slicing never raises, so a short URL simply yields all its segments.
    tgt_url_broad = "/".join(segments[:6])
    return tgt_title, tgt_url, tgt_url_broad
# (Web-page navigation residue, not part of the code:
#  "Comment list" / "Article table of contents".)