correct_item_descriptions.py (source code)

Language: python

Project: event-cui-transfer    Author: mit-ddig
# Imports this function relies on; clean_text, best_match, translate_words
# and main_dir are helpers defined elsewhere in this module of the original
# project.
import enchant
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


def consolidate_carevue(carevue):
    """Consolidate items from CV (CareVue).
    """
    cv_item_text = clean_text(carevue['label'])
    cv_vectorizer = CountVectorizer(analyzer="word")
    cv_bow_data = cv_vectorizer.fit_transform(cv_item_text)
    cv_vocab = cv_vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.2
    cv_counts = cv_bow_data.sum(axis=0)

    # Compute the edit distance between each element of the vocabulary
    # and the "dictionary".
    correct_by_count = []
    corrected = {}
    count = 0
    corrected_words = []
    no_match = []
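    # Spell-check every vocabulary word against a personal word list (PWL)
    # built from the project's MetaVision/ICD vocabulary file, falling back
    # to US English; unrecognised words are replaced by their best suggestion.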
    d = enchant.request_pwl_dict(
        main_dir + "/metavision_ids_icds_vocab_new.txt")
    d_english = enchant.Dict("en_US")
    for word in cv_vocab:
        word = word.lower()
        count += 1
        if not d.check(word) and not d.check(word.upper()) \
           and not d_english.check(word):
            no_match.append(word)
            suggestions = d.suggest(word)
            if suggestions == []:
                corrected[word] = word
            else:
                corrected[word] = best_match(word, suggestions, [])
                corrected_words.append(word)
        else:
            corrected[word] = word
    # apply map to correct spellings
    cv_item_corrected = \
        cv_item_text.str.split().apply(translate_words, args=(corrected,))
    cv_items_spellcheck = cv_item_corrected.str.join(' ')
    cv_items_df = pd.DataFrame({'itemid': cv_items_spellcheck.index.values,
                                'label': cv_items_spellcheck.values})
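    # Group itemids by their corrected description; items that now share the
    # same label are candidates for consolidation.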
    grouped = cv_items_df[['itemid', 'label']].groupby('label')
    grouped_trimmed = {}
    for key in grouped.groups.keys():
        # take the minimum itemid corresponding to this description.
        grouped_trimmed[key] = grouped.get_group(key).itemid.astype(str).min()
    dict_consolidate = {}
    for itemid in cv_items_df.itemid.astype(str):
        dict_consolidate[itemid] = []
    for key in grouped.groups.keys():
        values = grouped.get_group(key)
        min_val = min(values.itemid.astype(str))
        for val in values.itemid.astype(str):
            dict_consolidate[val].append(min_val)
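    # Collect the set of canonical (minimum) itemids; only these are kept in
    # the de-duplicated output below.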
    map_to_unique = set()
    for key in dict_consolidate:
        if min(dict_consolidate[key]) not in map_to_unique:
            map_to_unique.add(min(dict_consolidate[key]))
    cv_items_spellcheck.index = cv_items_spellcheck.index.astype(str)
    # filter cv_items_spellcheck so that there are no redundant items
    # (pass a list: newer pandas versions reject set-based .loc indexing)
    cv_items_spellcheck2 = cv_items_spellcheck.loc[list(map_to_unique)]
    return cv_item_text, cv_items_spellcheck, \
        cv_items_spellcheck2, dict_consolidate
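
A minimal usage sketch, under the assumption that carevue is a pandas DataFrame of CareVue rows indexed by itemid with a free-text 'label' column (as in the MIMIC D_ITEMS table this project appears to target), and that the module-level helpers clean_text, translate_words, best_match and main_dir from the original file are already defined. The toy labels and itemids below are hypothetical:

import pandas as pd

# Hypothetical toy input: CareVue item descriptions indexed by itemid.
carevue = pd.DataFrame(
    {'label': ['Heart Rate', 'Heart Rtae', 'Arterial BP Mean']},
    index=pd.Index([211, 211001, 52], name='itemid'))

raw_text, spellchecked, deduplicated, consolidate_map = consolidate_carevue(carevue)

print(spellchecked)      # corrected description for every itemid
print(deduplicated)      # one entry per unique corrected description
print(consolidate_map)   # itemid (as str) -> list holding its canonical itemid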