train.py 文件源码-python代码片段

train.py 文件源码

python

阅读 18 收藏 0 点赞 0 评论 0

项目：arxiv-doc2vec-recommender 作者: sepehr125 项目源码文件源码

def __iter__(self):
        """Stream the rows of the ``articles`` table as ``TaggedDocument``s.

        For every row, the title and the abstract are joined into one body
        of text, split into word and punctuation tokens, and lowercased.
        The row's ``index`` value is used as the document's only tag.

        A ``title`` or ``abstract`` that comes back as ``None`` (SQL NULL)
        is treated as an empty string, so a single incomplete row does not
        abort the whole pass over the corpus.

        Yields:
            TaggedDocument: ``TaggedDocument(words, tags)``, one per row.
        """
        # We want to keep some punctuation, as Word2Vec considers them
        # useful context. The pattern is loop-invariant, so compile it once
        # here rather than looking it up for every row.
        tokenize = re.compile(r"[\w']+|[.,!?;]").findall

        with conn.cursor(cursor_factory=DictCursor) as cur:
            # TODO: save names of table and database
            # to a central location. For now, db=arxive and table=articles
            cur.execute("SELECT * FROM articles;")
            for article in cur:
                # NULL columns arrive as None; fall back to '' so the string
                # operations below cannot raise on an incomplete row.
                title = article['title']
                if title is None:
                    title = ''
                abstract = article['abstract']
                if abstract is None:
                    abstract = ''
                abstract = abstract.replace('\n', ' ').strip()

                # train on body, composed of title and abstract
                body = title + '. ' + abstract

                # lowercase. perhaps lemmatize too?
                words = [word.lower() for word in tokenize(body)]

                # document tag. Unique integer 'index' is good.
                # A topic tag of the form 'topic_{subject_id}' (e.g. built
                # from article['subject']) could also be added to this list.
                tags = [article['index']]

                yield TaggedDocument(words, tags)