def retrieve_onet_titles(self):
    """Build a lowercase-title lookup table from the O*NET occupation files.

    For every version in ``ONET_VERSIONS`` the tab-separated
    'Occupation Data.txt' file is obtained through
    ``self.onet_downloader.download`` and the per-version tables are
    stacked into a single frame.

    Returns:
        pandas.DataFrame: indexed by the lowercased ``Title`` (one row per
        distinct lowercased title), with a lowercased ``Description``
        column and a ``Major`` column holding the first two characters of
        the file's first column (presumably the O*NET-SOC code -- confirm
        against the data files).
    """
    frames = [
        pd.read_csv(
            self.onet_downloader.download(
                version,
                'Occupation Data.txt',
                'occupation_data.txt'
            ),
            sep='\t'
        )
        for version in ONET_VERSIONS
    ]
    onet_titles = pd.concat(frames, ignore_index=True)

    # Major group = first two characters of the first column.
    onet_titles['Major'] = onet_titles.iloc[:, 0].str[:2]

    # Normalise case BEFORE de-duplicating: titles that differ only by
    # capitalisation between versions would otherwise both survive and
    # then collide once lowercased, leaving a non-unique index.
    onet_titles['Title'] = onet_titles['Title'].str.lower()
    onet_titles['Description'] = onet_titles['Description'].str.lower()

    # keep='last' retains the row coming from the version listed last in
    # ONET_VERSIONS (presumably the newest release -- verify the ordering).
    # The result supports a title -> row lookup, e.g.
    #   onet_titles.loc[u'sales agents, financial services']
    return (
        onet_titles
        .drop_duplicates('Title', keep='last')
        .set_index('Title')
    )
esa_jobtitle_normalizer.py 文件源码
python
阅读 32
收藏 0
点赞 0
评论 0
评论列表
文章目录