def __iter__(self):
    """Stream the ``articles`` table as tagged documents.

    Each row is turned into one ``TaggedDocument``: its words are the
    lowercased tokens of the title followed by the abstract, and its
    only tag is the row's ``'index'`` value.

    The connection is taken from the ``conn`` name in the enclosing
    scope rather than from ``self``.

    Yields:
        TaggedDocument: one per row returned by the query.
    """
    with conn.cursor(cursor_factory=DictCursor) as cursor:
        # TODO: keep the database and table names in one central place.
        # Until then: db=arxive, table=articles.
        cursor.execute("SELECT * FROM articles;")
        for row in cursor:
            # Flatten line breaks in the abstract and trim its ends.
            summary = row['abstract'].replace('\n', ' ').strip()
            # Training text is the title and the abstract joined by '. '.
            text = row['title'] + '. ' + summary
            # Some punctuation is kept as separate tokens, since Word2Vec
            # treats it as useful context. Tokens are lowercased
            # (lemmatizing them is a possible further step).
            tokens = [
                token.lower()
                for token in re.findall(r"[\w']+|[.,!?;]", text)
            ]
            # Document tag: the row's 'index', expected to be a unique
            # integer. A topic tag of the form 'topic_{subject_id}' could
            # be added to this list as well.
            doc_tags = [row['index']]
            yield TaggedDocument(tokens, doc_tags)
评论列表
文章目录