def search(searchTerm, list, keyName : str = None, numMatches : int = 3):
    """Fuzzy-search the provided list for searchTerm.

    Args:
        searchTerm: String to match against each item.
        list: Items to search; plain strings, or dicts when ``keyName`` is
            given.  (Parameter name kept for backward compatibility even
            though it shadows the builtin.)
        keyName: Optional dict key whose value is compared instead of the
            item itself.
        numMatches: Maximum number of results to return.

    Returns:
        Up to ``numMatches`` dicts of the form ``{'Item': ..., 'Ratio': ...}``
        sorted by descending similarity, or ``None`` for an empty list.
    """
    if not list:
        return None
    lowered = searchTerm.lower()
    searchList = []
    for item in list:
        testName = item[keyName] if keyName else item
        matchRatio = difflib.SequenceMatcher(None, lowered, testName.lower()).ratio()
        searchList.append({'Item': item, 'Ratio': matchRatio})
    # Best matches first.
    searchList.sort(key=lambda entry: entry['Ratio'], reverse=True)
    # Slicing clamps automatically when numMatches exceeds the list length.
    return searchList[:numMatches]
# --- Python SequenceMatcher() example sources ---
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def filecompare( self, filename_left, filename_right ):
    """Render a diff of two files into self.text_body.

    Each argument may be either a filename or an already-split list of
    lines.  Returns 1 on success, 0 if either file could not be read.
    """
    # A list argument is taken as pre-read file contents.
    if type(filename_left) == type([]):
        lines_left = filename_left
    else:
        try:
            lines_left = wb_read_file.readFileContentsAsUnicode( filename_left ).split('\n')
        except IOError as e:
            print( 'Error opening %s\n%s' % (filename_left, e) )
            return 0
    if type(filename_right) == type([]):
        lines_right = filename_right
    else:
        try:
            lines_right = wb_read_file.readFileContentsAsUnicode( filename_right ).split('\n')
        except IOError as e:
            print( 'Error opening %s\n%s' % (filename_right, e) )
            return 0
    # Strip trailing end-of-line characters before comparing.
    lines_left = [eolRemoval( line ) for line in lines_left]
    lines_right = [eolRemoval( line ) for line in lines_right]
    matcher = difflib.SequenceMatcher( isLineJunk, lines_left, lines_right )
    # Dispatch each diff opcode to the matching renderer on self.text_body.
    for tag, left_lo, left_hi, right_lo, right_hi in matcher.get_opcodes():
        if tag == 'replace':
            self.fancy_replace( lines_left, left_lo, left_hi, lines_right, right_lo, right_hi )
        elif tag == 'delete':
            self.dump( self.text_body.addDeletedLine, lines_left, left_lo, left_hi )
        elif tag == 'insert':
            self.dump( self.text_body.addInsertedLine, lines_right, right_lo, right_hi )
        elif tag == 'equal':
            self.dump( self.text_body.addNormalLine, lines_left, left_lo, left_hi )
        else:
            # difflib only emits the four tags above; anything else is a bug.
            raise ValueError( 'unknown tag ' + str( tag ) )
    self.text_body.addEnd()
    return 1
# need to strip any \n or \r that's on the end of the line
def validate(self, password, user=None):
    """Raise ValidationError when the password closely matches any user attribute."""
    if not user:
        return
    lowered_password = password.lower()
    for field_name in self.user_attributes:
        field_value = getattr(user, field_name, None)
        if not field_value or not isinstance(field_value, string_types):
            continue
        # Compare against each word of the attribute value and the whole value.
        parts = re.split(r'\W+', field_value) + [field_value]
        for part in parts:
            similarity = SequenceMatcher(a=lowered_password, b=part.lower()).quick_ratio()
            if similarity >= self.max_similarity:
                try:
                    verbose_name = force_text(user._meta.get_field(field_name).verbose_name)
                except FieldDoesNotExist:
                    # Plain attribute, not a model field: use its name as-is.
                    verbose_name = field_name
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def validate(self, password, user=None):
    """Reject a password that is too similar to any of the user's attributes."""
    if not user:
        return
    needle = password.lower()
    for attr in self.user_attributes:
        raw = getattr(user, attr, None)
        if not raw or not isinstance(raw, string_types):
            continue
        # Check every word of the value, then the value as a whole.
        for candidate in re.split(r'\W+', raw) + [raw]:
            if SequenceMatcher(a=needle, b=candidate.lower()).quick_ratio() >= self.max_similarity:
                try:
                    verbose_name = force_text(user._meta.get_field(attr).verbose_name)
                except FieldDoesNotExist:
                    # Not a model field; fall back to the attribute name.
                    verbose_name = attr
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def get_matching_blocks(self):
    """Return matching blocks, filtering out short spurious matches.

    Wraps difflib.SequenceMatcher.get_matching_blocks and keeps only
    blocks longer than an adaptive threshold, plus the zero-length
    sentinel block that callers rely on.
    """
    # BUG FIX: the adaptive size must consider both sequences;
    # previously this read min(len(self.b), len(self.b)).
    size = min(len(self.a), len(self.b))
    threshold = min(self.threshold, size / 4)
    actual = difflib.SequenceMatcher.get_matching_blocks(self)
    return [item for item in actual
            if item[2] > threshold
            or not item[2]]
def get_scored_matches(word: str, possibilities: List[str], n: int=3, cutoff: float=0.6) -> List[Tuple[float, str]]:
    """Return the best ``n`` fuzzy matches for ``word`` as (score, word) pairs.

    Like difflib.get_close_matches, but the similarity score is returned
    alongside each match.  Raises ValueError on a non-positive ``n`` or a
    cutoff outside [0.0, 1.0].
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not (0.0 <= cutoff <= 1.0):
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
    matcher: SequenceMatcher = SequenceMatcher()
    matcher.set_seq2(word)
    scored = []
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # Cheap upper bounds first; compute the true ratio only when needed.
        if (matcher.real_quick_ratio() >= cutoff
                and matcher.quick_ratio() >= cutoff
                and matcher.ratio() >= cutoff):
            scored.append((matcher.ratio(), candidate))
    # Keep only the n highest-scoring candidates.
    return heapq.nlargest(n, scored)
def build_token_counts(characterizer, texts):
    """Tokenize ``texts`` in place and return overall token frequencies.

    Trains a Tokenizer on all text bodies, stores each text's tokens under
    the 'tokens' key (augmented with URL display strings and, for retweets,
    the retweeted user's handle), and returns a Counter of all tokens.
    """
    tokenizer = Tokenizer(characterizer=characterizer)
    tokenizer.train([t['text'] for t in texts])
    token_counts = Counter()
    # NOTE: an unused difflib.SequenceMatcher instance was removed here.
    for t in texts:
        t['tokens'] = tokenizer.tokenize(t['text'])
        if not t['tokens']:
            continue
        if 'urls' in t['entities'] and t['entities']['urls']:
            # TODO: replace those urls instead of adding them
            for url in t['entities']['urls']:
                t['tokens'].append(url['display_url'])
        if t['__is_rt__']:
            # Count the retweeted account as a token of the text.
            t['tokens'].append(u'@{0}'.format(t['user']['screen_name']).lower())
        token_counts.update(t['tokens'])
    return token_counts
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def sededit(a, b, context=0):
    '''
    Take two strings and output a sed-like diff
    '''
    if a == b:
        return ''
    len_a = len(a)
    len_b = len(b)
    # Accumulate the tightest span covering all non-equal opcodes.
    start1, end1 = len_a, 0
    start2, end2 = len_b, 0
    matcher = difflib.SequenceMatcher(None, a, b)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            continue
        # Widen pure insertions by one so the edit anchors to a neighbour.
        ins = 1 if tag == 'insert' else 0
        start1 = max(min(i1 - context - ins, start1), 0)
        start2 = max(min(j1 - context - ins, start2), 0)
        end1 = min(max(i2 + context + ins, end1), len_a)
        end2 = min(max(j2 + context + ins, end2), len_b)
    # Anchor with ^/$ when the edited span touches either end of `a`.
    return 's/%s%s%s/%s/' % (
        ('' if start1 else '^'), a[start1:end1],
        ('$' if end1 == len_a else ''), b[start2:end2])
def pick_best(title, item1, item2):
    """
    Pick best record among two items with identical scores.

    Prefers the item whose title is most similar to ``title``; when only
    one item has a title, that item wins; ties fall back to record type.
    """
    def compare(candidate):
        return difflib.SequenceMatcher(None, title.lower(), candidate.lower()).ratio()
    if not item1['title']:
        return item2
    elif not item2['title']:
        # BUG FIX: previously returned item2 here, discarding the only
        # item that actually has a title.
        return item1
    r1 = compare(item1['title'][0])
    r2 = compare(item2['title'][0])
    if r1 > r2:
        return item1
    elif r2 > r1:
        return item2
    else:
        # Try to find other discriminating criteria... e.g. prefer journal-articles
        if score_type(item1["type"]) > score_type(item2["type"]):
            return item1
        else:
            return item2
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def get_initial_matches(self):
    """
    This does the main work of finding matching n-gram sequences between
    the texts.
    """
    matcher = SequenceMatcher(None, self.textAgrams, self.textBgrams)
    all_blocks = matcher.get_matching_blocks()
    # Keep only sequences longer than the user-supplied threshold.
    filtered = [blk for blk in all_blocks if blk.size > self.threshold]
    if len(filtered) > 0:
        print('%s total matches found.' % len(filtered), flush=True)
    return filtered
def render_diff(old_text, new_text):
    """Render an HTML character diff between two strings.

    Equal spans pass through unchanged; deletions and insertions are
    wrapped in <span class="delete"> / <span class="insert"> respectively.

    Note: inputs are interpolated into HTML without escaping, so callers
    must supply trusted or pre-escaped text.
    """
    # BUG FIX: removed leftover debug print() calls of both inputs.
    sm = difflib.SequenceMatcher(a=old_text, b=new_text)
    out_toks = []
    for opcode, s1, e1, s2, e2 in sm.get_opcodes():
        if opcode == 'equal':
            out_toks.append(old_text[s1:e1])
        elif opcode == 'insert':
            out_toks.append('<span class="insert">' + new_text[s2:e2] + '</span>')
        elif opcode == 'delete':
            out_toks.append('<span class="delete">' + old_text[s1:e1] + '</span>')
        elif opcode == 'replace':
            # A replacement renders as a deletion followed by an insertion.
            out_toks.append('<span class="delete">' + old_text[s1:e1] + '</span>')
            out_toks.append('<span class="insert">' + new_text[s2:e2] + '</span>')
    return ''.join(out_toks)
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def validate(self, password, user=None):
    """Raise ValidationError when the password resembles a user attribute."""
    if not user:
        return
    password_lc = password.lower()
    for attr_name in self.user_attributes:
        attr_value = getattr(user, attr_name, None)
        if not attr_value or not isinstance(attr_value, string_types):
            continue
        # Test each word of the value and then the full value itself.
        fragments = re.split(r'\W+', attr_value) + [attr_value]
        for fragment in fragments:
            score = SequenceMatcher(a=password_lc, b=fragment.lower()).quick_ratio()
            if score > self.max_similarity:
                try:
                    verbose_name = force_text(user._meta.get_field(attr_name).verbose_name)
                except FieldDoesNotExist:
                    # Not backed by a model field; report the raw attribute name.
                    verbose_name = attr_name
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def get_diff_lines(self):
    """Split self.post.code and self.code into aligned (tag, text) segments.

    Returns a (postdiffs, commentdiffs) pair where each segment is tagged
    'eq' for matching text or 'mod' for changed text; whitespace is treated
    as junk when matching.
    """
    import difflib
    post_segments = list()
    comment_segments = list()
    matcher = difflib.SequenceMatcher(lambda ch: ch.isspace(), self.post.code, self.code)
    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        # Replacements count as modifications on both sides.
        if tag in ('replace', 'delete'):
            post_segments.append(('mod', matcher.a[a_lo:a_hi]))
        if tag in ('replace', 'insert'):
            comment_segments.append(('mod', matcher.b[b_lo:b_hi]))
        if tag == 'equal':
            post_segments.append(('eq', matcher.a[a_lo:a_hi]))
            comment_segments.append(('eq', matcher.b[b_lo:b_hi]))
    self.__normalize__(post_segments)
    self.__normalize__(comment_segments)
    return (post_segments, comment_segments)
def get_custom_path(self, searchtitle, title):
    '''locate custom folder on disk as pvrart location'''
    title_path = ""
    custom_path = self._mutils.addon.getSetting("pvr_art_custom_path")
    if custom_path and self._mutils.addon.getSetting("pvr_art_custom") == "true":
        # Use whichever path separator the configured path already uses.
        delim = "\\" if "\\" in custom_path else "/"
        dirs = xbmcvfs.listdir(custom_path)[0]
        # Try progressively looser fuzzy-match thresholds until a folder matches.
        for strictness in [1, 0.95, 0.9, 0.8]:
            if title_path:
                break
            for directory in dirs:
                if title_path:
                    break
                # NOTE(review): .decode("utf-8") implies xbmcvfs returns byte
                # strings (Python 2) -- confirm against the target Kodi version.
                directory = directory.decode("utf-8")
                curpath = os.path.join(custom_path, directory) + delim
                # Match against both the display title and the search title.
                for item in [title, searchtitle]:
                    match = SM(None, item, directory).ratio()
                    if match >= strictness:
                        title_path = curpath
                        break
        # No existing folder found: propose a new one when downloads are enabled.
        if not title_path and self._mutils.addon.getSetting("pvr_art_download") == "true":
            title_path = os.path.join(custom_path, normalize_string(title)) + delim
    return title_path
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def ratio(s1, s2):
    """Return the similarity of s1 and s2 as an integer percentage in [0, 100]."""
    for label, value in (("s1", s1), ("s2", s2)):
        if value is None:
            raise TypeError("%s is None" % label)
    s1, s2 = utils.make_type_consistent(s1, s2)
    # An empty string cannot meaningfully match anything.
    if min(len(s1), len(s2)) == 0:
        return 0
    return utils.intr(100 * SequenceMatcher(None, s1, s2).ratio())
# TODO: skip duplicate indexes for a little more speed
def P_update_tree(self, user, archive_path): # private, plex can't use _var
    """update the cache of the dir read state for everything between cb_path and archive_path."""
    Log.Debug('updating tree {}'.format(archive_path))
    base = Prefs['cb_path']
    # Diff the configured base path against the archive path; the inserted
    # opcode span is the part of archive_path below the base directory.
    x = difflib.SequenceMatcher(a=base, b=archive_path)
    for tag, i1, i2, j1, j2 in x.get_opcodes():
        if tag == 'insert':
            try:
                # First path component of the inserted suffix = archive root dir.
                diff = os.path.split(archive_path[j1:j2])[0]
                d = diff.replace('\\', '/').split('/')[1]
                path = os.path.join(base, d)
                Log.Debug('archive root: {}'.format(path))
                if os.path.abspath(base) == os.path.abspath(path):
                    Log.Debug('item is in root dir. skipping.')
                else:
                    # NOTE(review): return value unused -- dir_read_state is
                    # presumably called for its caching side effect; confirm.
                    state = self.dir_read_state(user, path, True)
            except Exception as e:
                # Best-effort update: log and keep processing other opcodes.
                Log.Error('P_update_tree {}'.format(e))
    return
def validate(self, password, user=None):
    """Reject passwords too similar to the user's attribute values.

    Each attribute named in ``self.user_attributes`` is split on non-word
    characters and each part (plus the whole value) is fuzzily compared to
    the password.

    Raises:
        ValidationError: when any part's quick_ratio() exceeds
            ``self.max_similarity``.
    """
    if not user:
        return
    for attribute_name in self.user_attributes:
        value = getattr(user, attribute_name, None)
        if not value or not isinstance(value, string_types):
            continue
        # Raw string: '\W' in a non-raw literal is an invalid escape
        # sequence (SyntaxWarning on modern Python).
        value_parts = re.split(r'\W+', value) + [value]
        for value_part in value_parts:
            if SequenceMatcher(a=password.lower(), b=value_part.lower()).quick_ratio() > self.max_similarity:
                verbose_name = force_text(user._meta.get_field(attribute_name).verbose_name)
                raise ValidationError(
                    _("The password is too similar to the %(verbose_name)s."),
                    code='password_too_similar',
                    params={'verbose_name': verbose_name},
                )
def reset(self):
    """
    Resets thread data model
    """
    # Transaction / database state.
    self.hashDBCursor = None
    self.inTransaction = False
    # Output control.
    self.disableStdOut = False
    # Most-recent request/response bookkeeping.
    self.lastComparisonHeaders = None
    self.lastComparisonPage = None
    self.lastErrorPage = None
    self.lastHTTPError = None
    self.lastQueryDuration = 0
    self.lastRedirectMsg = None
    self.lastRedirectURL = None
    self.lastRequestMsg = None
    self.lastRequestUID = 0
    # Retry / resume bookkeeping.
    self.resumed = False
    self.retriesCount = 0
    # Reusable page-comparison matcher and shared state.
    self.seqMatcher = difflib.SequenceMatcher(None)
    self.shared = shared
    self.valueStack = []
def get_relevant_entities(self, google_cloud_entities, target_entities, target_wikipedia_urls):
    """Select Google Cloud entities relevant to the given targets.

    An entity is relevant when its Wikipedia URL matches one of
    ``target_wikipedia_urls`` (case-insensitive), or when any word of its
    name fuzzily matches any word of a target entity name (ratio > 0.7).

    Returns:
        A list of matching entity names, each entity contributing at most
        one entry.
    """
    entities_to_return = []
    target_wikipedia_urls_lower = [url.lower() for url in target_wikipedia_urls]
    for google_cloud_entity in google_cloud_entities:
        # A Wikipedia URL match is authoritative.
        if google_cloud_entity.wikipedia_url and google_cloud_entity.wikipedia_url.lower() in target_wikipedia_urls_lower:
            entities_to_return.append(google_cloud_entity.name)
            continue
        # Otherwise fuzzily compare name words against target name words.
        name_parts = google_cloud_entity.name.lower().split(" ")
        matched = False
        for target_entity in target_entities:
            target_parts = target_entity.lower().split(" ")
            for name_part in name_parts:
                for target_part in target_parts:
                    if SequenceMatcher(None, name_part, target_part).ratio() > 0.7:
                        # BUG FIX: membership used to be tested against the
                        # entity object in a list of *names*, so the same
                        # name could be appended multiple times.
                        entities_to_return.append(google_cloud_entity.name)
                        matched = True
                        break
                if matched:
                    break
            if matched:
                break
    return entities_to_return
def closest_rule(self, adapter):
    """Return the registered rule most similar to this endpoint, or None."""
    def score_rule(rule):
        # Endpoint-name similarity dominates; matching args/method break ties.
        endpoint_similarity = difflib.SequenceMatcher(
            None, rule.endpoint, self.endpoint
        ).ratio()
        args_ok = bool(set(self.values or ()).issubset(rule.arguments))
        method_ok = bool(rule.methods and self.method in rule.methods)
        return 0.98 * endpoint_similarity + 0.01 * args_ok + 0.01 * method_ok
    if adapter and adapter.map._rules:
        return max(adapter.map._rules, key=score_rule)
    return None
def similar(a, b):
    """Return the difflib similarity ratio between a and b (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
def closest_rule(self, adapter):
    """Find the rule whose endpoint best matches this request, if any exist."""
    if not adapter or not adapter.map._rules:
        return None
    wanted_values = set(self.values or ())
    def score_rule(rule):
        # Similarity of endpoint names carries almost all the weight.
        return sum([
            0.98 * difflib.SequenceMatcher(None, rule.endpoint, self.endpoint).ratio(),
            0.01 * bool(wanted_values.issubset(rule.arguments)),
            0.01 * bool(rule.methods and self.method in rule.methods),
        ])
    return max(adapter.map._rules, key=score_rule)
def global_search(cls, text, limit, menu='ir.ui.menu'):
    """
    Search on models for text including menu
    Returns a list of tuple (ratio, model, model_name, id, name, icon)
    The size of the list is limited to limit
    """
    pool = Pool()
    ModelAccess = pool.get('ir.model.access')
    if not limit > 0:
        raise ValueError('limit must be > 0: %r' % (limit,))
    # Models flagged for global search, plus the menu model itself.
    models = cls.search(['OR',
        ('global_search_p', '=', True),
        ('model', '=', menu),
        ])
    access = ModelAccess.get_access([m.model for m in models])
    s = StringMatcher()
    # NOTE(review): str.decode only exists on Python 2 byte strings; on
    # Python 3 this branch raises AttributeError -- confirm the supported
    # interpreter version.
    if isinstance(text, str):
        text = text.decode('utf-8')
    s.set_seq2(text)
    def generate():
        # Yield a similarity-scored tuple for each readable, globally
        # searchable record.
        for model in models:
            if not access[model.model]['read']:
                continue
            Model = pool.get(model.model)
            if not hasattr(Model, 'search_global'):
                continue
            for record, name, icon in Model.search_global(text):
                if isinstance(name, str):
                    name = name.decode('utf-8')
                s.set_seq1(name)
                yield (s.ratio(), model.model, model.rec_name,
                    record.id, name, icon)
    # Keep only the `limit` best-scoring matches.
    return heapq.nlargest(int(limit), generate())