def open(self):
        """Initialise the spelling checker from configuration.

        Leaves ``self.initialized`` False (checker disabled) when the
        enchant backend is missing or no spelling dictionary is configured.
        """
        self.initialized = False
        self.private_dict_file = None

        # enchant is an optional dependency; without it this checker is a no-op.
        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        # Words the checker should never flag, from the comma-separated option.
        self.ignore_list = [
            word.strip()
            for word in self.config.spelling_ignore_words.split(",")
        ]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list.extend(["param", "pylint"])

        pwl_path = self.config.spelling_private_dict_file
        if pwl_path:
            # A personal word list supplements the base dictionary; keep the
            # file handle open in append mode so new words can be stored.
            self.spelling_dict = enchant.DictWithPWL(dict_name, pwl_path)
            self.private_dict_file = open(pwl_path, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = string.punctuation.replace("'", "").replace("_", "")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.initialized = True
Example source code for Python's enchant Dict() class
def open(self):
        """Set up the spell-checking machinery once configuration is known.

        Leaves ``self.initialized`` False (disabling the checker) when the
        enchant backend is unavailable or no dictionary name is configured.
        """
        self.initialized = False
        self.private_dict_file = None
        # enchant is an optional dependency; without it the checker is a no-op.
        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return
        # Words the checker should never flag, from a comma-separated option.
        self.ignore_list = [w.strip() for w in self.config.spelling_ignore_words.split(",")]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list.extend(["param", "pylint"])
        if self.config.spelling_private_dict_file:
            # A personal word list supplements the base dictionary; the file
            # is kept open in append mode so newly learnt words can be stored.
            self.spelling_dict = enchant.DictWithPWL(
                dict_name, self.config.spelling_private_dict_file)
            self.private_dict_file = open(
                self.config.spelling_private_dict_file, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)
        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()
        # Prepare regex for stripping punctuation signs from text.
        # ' and _ are treated in a special way.
        puncts = string.punctuation.replace("'", "").replace("_", "")
        self.punctuation_regex = re.compile('[%s]' % re.escape(puncts))
        self.initialized = True
def __init__(self, dict_name='en', max_dist=2):
        """Create a corrector backed by an enchant dictionary.

        :param dict_name: enchant language tag (default ``'en'``).
        :param max_dist: edit-distance threshold stored for later use
            (presumably bounds accepted suggestions — see callers).
        """
        self.max_dist = max_dist
        self.spell_dict = enchant.Dict(dict_name)
def open(self):
        """Prepare the spelling checker; a no-op without enchant or a dictionary."""
        self.initialized = False
        self.private_dict_file = None
        if enchant is None:
            return
        dict_name = self.config.spelling_dict
        if not dict_name:
            return

        # Build the ignore list from the comma-separated config option.
        entries = self.config.spelling_ignore_words.split(",")
        self.ignore_list = [entry.strip() for entry in entries]
        # "param" appears in docstring in param description and
        # "pylint" appears in comments in pylint pragmas.
        self.ignore_list += ["param", "pylint"]

        private_path = self.config.spelling_private_dict_file
        if private_path:
            self.spelling_dict = enchant.DictWithPWL(dict_name, private_path)
            # Keep the personal word list open for appending new words.
            self.private_dict_file = open(private_path, "a")
        else:
            self.spelling_dict = enchant.Dict(dict_name)

        if self.config.spelling_store_unknown_words:
            self.unknown_words = set()

        # Regex matching punctuation to strip from text; ' and _ are
        # excluded because they are treated in a special way.
        punctuation = string.punctuation.replace("'", "").replace("_", "")
        self.punctuation_regex = re.compile('[%s]' % re.escape(punctuation))
        self.initialized = True
def __init__(self,lang=None,text=None,tokenize=None,chunkers=None,filters=None):
        """Constructor for the SpellChecker class.

        SpellChecker objects can be created in two ways, depending on
        the nature of the first argument.  If it is a string, it
        specifies a language tag from which a dictionary is created.
        Otherwise, it must be an enchant Dict object to be used.

        Optional keyword arguments are:

            * text: to set the text to be checked at creation time
            * tokenize: a custom tokenization function to use
            * chunkers: a list of chunkers to apply during tokenization
            * filters: a list of filters to apply during tokenization

        If <tokenize> is not given and the first argument is a Dict,
        its 'tag' attribute must be a language tag so that a tokenization
        function can be created automatically.  If this attribute is missing
        the user's default language will be used.

        Raises DefaultLanguageNotFoundError when no language tag can be
        determined at all.
        """
        if lang is None:
            lang = get_default_language()
        if isinstance(lang,basestring):
            # A language tag was given: build a dictionary from it.
            # (local renamed from `dict` to avoid shadowing the builtin)
            dictionary = enchant.Dict(lang)
        else:
            # Otherwise assume an enchant Dict object was passed directly
            # and recover the language tag from it if possible.
            dictionary = lang
            try:
                lang = dictionary.tag
            except AttributeError:
                lang = get_default_language()
        if lang is None:
            raise DefaultLanguageNotFoundError
        self.lang = lang
        self.dict = dictionary
        if tokenize is None:
            try:
                tokenize = get_tokenizer(lang,chunkers,filters)
            except TokenizerNotFoundError:
                # Fall back to default tokenization if no match for 'lang'
                tokenize = get_tokenizer(None,chunkers,filters)
        self._tokenize = tokenize
        # Position/word of the error currently being handled.
        self.word = None
        self.wordpos = None
        self._ignore_words = {}
        self._replace_words = {}
        # Default to the empty string as the text to be checked
        self._text = array.array('u')
        self._use_tostring = False
        self._tokens = iter([])
        if text is not None:
            self.set_text(text)
def getInfoFromEmail(emailData):
    """Parse a check-in confirmation email and extract itinerary details.

    Returns a list of dicts with keys 'confNum', 'firstName', 'lastName',
    'datetime' and 'city' — one entry per (itinerary leg, passenger) pair.
    An empty list is produced when date/time parsing fails.
    """
    msgTextList = getEmailText(emailData[0][1])
    for msgText in msgTextList:
        confNum = getConfNum(msgText)
        # see if there are multiple itineraries
        msgTextSplit = msgText.split()
        if confNum in msgTextSplit:
            confNumIndex = msgTextSplit.index(confNum)
        else:
            # Some messages wrap the confirmation number in asterisks.
            confNumIndex = msgTextSplit.index('*'+confNum+'*')
        firstName = msgTextSplit[confNumIndex+1]
        lastName = msgTextSplit[confNumIndex+2]
        if 'Passenger(s)' in firstName:
            # See if there is a / in the name
            if '/' in lastName:
                firstName = lastName[lastName.index('/')+1:]
                lastName = lastName[0:lastName.index('/')]
            else:
                print("PROBLEM PARSING THE FIRST AND LAST NAMES!")
        elif msgTextSplit[confNumIndex+4] == 'Date':
            lastName = msgTextSplit[confNumIndex+3]
            print("Make sure user used a middle initial")
        # see if there are < formatting issues
        if firstName == '>':
            firstName = msgTextSplit[confNumIndex+2]
            lastName = msgTextSplit[confNumIndex+4]
        if lastName == '>':
            print("AAAH")
            print(msgTextSplit[confNumIndex+3])
            lastName = msgTextSplit[confNumIndex+3]
        # A second 6-char token that is not an English word is taken to be a
        # second confirmation number (two passengers in one email).
        possible2ndConf = msgTextSplit[confNumIndex+3][1:-1]
        if len(possible2ndConf) == 6 and not enchant.Dict("en_US").check(possible2ndConf):
            confNum = [confNum,str(possible2ndConf)]
            firstName = [firstName, str(msgTextSplit[confNumIndex+4])]
            lastName = [lastName, str(msgTextSplit[confNumIndex+5])]
        else:
            confNum = [confNum]
            firstName = [firstName]
            lastName = [lastName]
        # get the time you need to check in
        checkInTime = getCheckInTime(msgText)
        checkInDate = getCheckInDate(msgText)
        checkInCity = getCheckInCity(msgText)
        try:
            infoList = []
            for j in xrange(len(checkInDate)):
                for i in xrange(len(firstName)):
                    info = {'confNum':confNum[i],
                            'firstName':firstName[i],
                            'lastName':lastName[i],
                            'datetime':parser.parse(checkInDate[j] + ' ' + \
                                                    checkInTime[j]),
                            'city':checkInCity[j]}
                    infoList.append(info)
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are
        # no longer swallowed; any parse failure still yields an empty list.
        except Exception:
            infoList = []
    # NOTE(review): infoList is rebuilt each loop iteration, so only the LAST
    # message's results are returned — confirm this is intended.
    print('info from email:')
    print(infoList)
    return infoList
def consolidate_carevue(carevue):
    """Consolidate items from CareVue.

    Spell-corrects item labels against a MetaVision vocabulary word list,
    then groups identical corrected labels and maps each itemid to the
    minimum itemid sharing its label.

    :param carevue: mapping/DataFrame with a 'label' column of item
        descriptions indexed by itemid (assumed — confirm against caller).
    :returns: tuple of (cleaned label text, spell-checked labels,
        de-duplicated spell-checked labels, itemid consolidation map).
    """
    cv_item_text = clean_text(carevue['label'])
    # Bag-of-words over the labels just to extract the vocabulary.
    cv_vectorizer = CountVectorizer(analyzer = "word")
    cv_bow_data = cv_vectorizer.fit_transform(cv_item_text)
    cv_vocab = cv_vectorizer.get_feature_names()
    cv_counts = cv_bow_data.sum(axis=0)  # NOTE(review): unused below
    # Compute edit distance between each element in vocabulary
    # with "dictionary"
    correct_by_count = []  # NOTE(review): never populated
    corrected = {}  # word -> corrected word (identity when already valid)
    count = 0
    corrected_words = []  # words whose spelling was changed
    no_match = []  # words found in neither dictionary
    # Personal word list built from the MetaVision vocabulary file;
    # `main_dir` is a module-level path defined elsewhere in the file.
    d = enchant.request_pwl_dict(
        main_dir + "/metavision_ids_icds_vocab_new.txt")
    d_english = enchant.Dict("en_US")
    for word in cv_vocab:
        word = word.lower()
        count += 1
        # A word is "unknown" only if it fails the vocabulary (both cases)
        # AND the English dictionary.
        if not d.check(word) and not d.check(word.upper()) \
            and not d_english.check(word):
            no_match.append(word)
            suggestions = d.suggest(word)
            if suggestions == []:
                # No suggestion available: keep the word unchanged.
                corrected[word] = word
            else:
                corrected[word] = best_match(word, suggestions, [])
                corrected_words.append(word)
        else:
            corrected[word] = word
    # apply map to correct spellings
    cv_item_corrected = \
        cv_item_text.str.split().apply(translate_words, args=(corrected,))
    cv_items_spellcheck = cv_item_corrected.str.join(' ')
    cv_items_df = pd.DataFrame({'itemid': cv_items_spellcheck.index.values,
                                'label': cv_items_spellcheck.values})
    # Items whose corrected labels are identical fall into one group.
    grouped = cv_items_df[['itemid', 'label']].groupby('label')
    grouped_trimmed = {}  # NOTE(review): built but not returned or used
    for key in grouped.groups.keys():
        # take the minimum itemid corresponding to this description.
        grouped_trimmed[key] = grouped.get_group(key).itemid.astype(str).min()
    # Map every itemid to the minimum itemid of its label group (string
    # comparison, so the "minimum" is lexicographic — confirm intended).
    dict_consolidate = {}
    for itemid in cv_items_df.itemid.astype(str):
        dict_consolidate[itemid] = []
    for key in grouped.groups.keys():
        values = grouped.get_group(key)
        min_val = min(values.itemid.astype(str))
        for val in values.itemid.astype(str):
            dict_consolidate[val].append(min_val)
    # Collect one representative itemid per group.
    map_to_unique = set()
    for key in dict_consolidate:
        if min(dict_consolidate[key]) not in map_to_unique:
            map_to_unique.add(min(dict_consolidate[key]))
    cv_items_spellcheck.index = cv_items_spellcheck.index.astype(str)
    # filter cv_items_spellcheck so that there are no redundant items
    cv_items_spellcheck2 = cv_items_spellcheck.loc[map_to_unique]
    return cv_item_text, cv_items_spellcheck, \
        cv_items_spellcheck2, dict_consolidate