def consolidate_carevue(carevue):
    """Spell-correct CareVue item labels and consolidate duplicate items.

    The function works in three steps:

    1. Clean ``carevue['label']`` with ``clean_text`` and collect the word
       vocabulary of the cleaned labels.
    2. Build a word -> replacement map.  A word that is accepted by the
       personal word list (``metavision_ids_icds_vocab_new.txt`` under
       ``main_dir``, checked in lower and upper case) or by the ``en_US``
       dictionary maps to itself.  Any other word maps to the result of
       ``best_match`` over the word list's suggestions, or to itself when
       there are no suggestions.
    3. Rewrite every label with that map and group the items whose
       corrected labels are identical.

    Args:
        carevue: Object with a ``'label'`` column.  The cleaned labels are
            presumably a pandas Series indexed by item id, since ``.str``
            and ``.index`` are used on them -- TODO confirm against
            ``clean_text``.

    Returns:
        A 4-tuple ``(cv_item_text, cv_items_spellcheck,
        cv_items_spellcheck2, dict_consolidate)``:

        * ``cv_item_text`` -- the cleaned labels as returned by
          ``clean_text``.
        * ``cv_items_spellcheck`` -- the spell-corrected labels, with the
          index converted to ``str``.
        * ``cv_items_spellcheck2`` -- the rows of ``cv_items_spellcheck``
          selected by the representative item ids, in sorted id order.
        * ``dict_consolidate`` -- maps each item id (as ``str``) to a list
          holding the smallest item id of every label group it belongs to.

    Note:
        Item ids are compared as strings, so "smallest" means
        lexicographically smallest (``'10' < '9'``).
    """
    cv_item_text = clean_text(carevue['label'])

    # Only the fitted vocabulary is needed, not the bag-of-words matrix.
    cv_vectorizer = CountVectorizer(analyzer="word")
    cv_vectorizer.fit(cv_item_text)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back to the old name on older installations.
    get_vocab = getattr(cv_vectorizer, "get_feature_names_out", None)
    if get_vocab is None:
        get_vocab = cv_vectorizer.get_feature_names
    cv_vocab = get_vocab()

    # Check each vocabulary word against the personal word list and the
    # English dictionary; unknown words get the best suggestion.
    d = enchant.request_pwl_dict(
        main_dir + "/metavision_ids_icds_vocab_new.txt")
    d_english = enchant.Dict("en_US")
    corrected = {}
    for word in cv_vocab:
        word = word.lower()
        if d.check(word) or d.check(word.upper()) or d_english.check(word):
            corrected[word] = word
            continue
        suggestions = d.suggest(word)
        if suggestions:
            corrected[word] = best_match(word, suggestions, [])
        else:
            corrected[word] = word

    # Apply the map to correct spellings.
    cv_item_corrected = \
        cv_item_text.str.split().apply(translate_words, args=(corrected,))
    cv_items_spellcheck = cv_item_corrected.str.join(' ')

    cv_items_df = pd.DataFrame({'itemid': cv_items_spellcheck.index.values,
                                'label': cv_items_spellcheck.values})

    # Every item id starts with an empty list; each label group then
    # appends its minimum item id to the entry of each of its members.
    dict_consolidate = {
        itemid: [] for itemid in cv_items_df['itemid'].astype(str)}
    for _, group in cv_items_df.groupby('label'):
        group_ids = group['itemid'].astype(str)
        min_val = min(group_ids)
        for val in group_ids:
            dict_consolidate[val].append(min_val)

    # Representative ids.  Items whose label was dropped by groupby (a
    # missing label) keep an empty list and contribute no representative.
    map_to_unique = {
        min(targets) for targets in dict_consolidate.values() if targets}

    cv_items_spellcheck.index = cv_items_spellcheck.index.astype(str)
    # Filter cv_items_spellcheck so that there are no redundant items.
    # .loc does not accept a set (TypeError since pandas 2.0); a sorted
    # list also makes the row order deterministic.
    cv_items_spellcheck2 = cv_items_spellcheck.loc[sorted(map_to_unique)]
    return cv_item_text, cv_items_spellcheck, \
        cv_items_spellcheck2, dict_consolidate
correct_item_descriptions.py 文件源码
python
阅读 21
收藏 0
点赞 0
评论 0
评论列表
文章目录