def process_readme(idx, readme_filename, s3_bucket):
    """Run the tagging pipeline on a single package README read from S3.

    The README JSON is loaded through ``s3_bucket.read_json_file``. When it is
    empty, the result of ``getNPMdescription`` is used as plaintext content
    instead. Only ``'Markdown'`` and ``'plaintext'`` content types are
    processed; Markdown is additionally passed through
    ``markdown_preprocess``. The resulting text is written as a JSON document
    (``{"id": idx, "text": ...}``) to a temporary file under ``PATH_PREFIX``,
    handed to ``run_pipeline``, and the file is removed afterwards. Non-empty
    results are stored in the module-level ``tags_dict`` keyed by package
    name.

    Every failure mode that is handled here is logged as a warning and makes
    the function return early; nothing is raised for those cases.

    :param idx: value stored under ``"id"`` in the pipeline input document.
    :param readme_filename: key of the README in the bucket, typically of the
        form ``npm/<package>/README.json``.
    :param s3_bucket: object providing ``read_json_file(key)``.
    :return: ``None``.
    """
    # Start from the full key so that keys without the 'npm/' prefix no longer
    # leave package_name unbound (previously an UnboundLocalError).
    package_name = readme_filename
    if package_name.startswith('npm/'):
        package_name = package_name[len('npm/'):]
    if package_name.endswith('/README.json'):
        package_name = package_name[:-len('/README.json')]

    try:
        readme_content = s3_bucket.read_json_file(readme_filename)
    except Exception:
        _logger.warning("[MISSING_DATA] Readme/NPMJS description for package {} does "
                        "not exist in S3.".format(package_name))
        return

    if not readme_content:
        # No README stored: fall back to the NPMJS description as plaintext.
        npmjs_description = getNPMdescription(package_name)
        if not npmjs_description:
            _logger.warning("[MISSING_DATA] Readme/NPMJS description for package {} does "
                            "not exist in S3.".format(package_name))
            return
        readme_content = {
            'type': 'plaintext',
            'content': npmjs_description
        }

    content_type = readme_content['type']
    if content_type not in ('Markdown', 'plaintext'):
        _logger.warning("[FORMAT] Skipping {}, content is not in markdown format"
                        " but in {}.".format(readme_filename, content_type))
        return

    # Newlines are flattened to spaces before the ASCII check.
    text = returnContentIfAscii(readme_content['content'].replace('\n', ' '))
    if not text:
        _logger.warning("[ENCODING] Ignoring package {} as the readme is not in"
                        " ascii".format(package_name))
        return

    if content_type == 'Markdown':
        try:
            text = markdown_preprocess(text)
        except Exception:
            _logger.warning(
                "[CONTENT] Could not get tags for {}".format(package_name))
            return

    # '/' in (scoped) package names is replaced so the name is a single path
    # component under PATH_PREFIX.
    curfilename = os.path.join(PATH_PREFIX, package_name.replace('/', ':::'))
    with open(curfilename, 'w') as of:
        of.write(json.dumps({"id": idx, "text": text}))

    try:
        tags = run_pipeline(curfilename)
        if tags:
            print(tags)
            tags_dict[package_name] = tags
    except Exception:
        _logger.warning(
            "[CONTENT] Could not get tags for {}".format(package_name))
    finally:
        # Always clean up the temporary pipeline input file.
        os.remove(curfilename)
pytextrank_textrank_scoring.py 文件源码
python
阅读 28
收藏 0
点赞 0
评论 0
评论列表
文章目录