def lin_similarity(self, other, ic, verbose=False):
"""
Lin Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node) and that of the two input Synsets. The relationship is
given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
:param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
:return: A float score denoting the similarity of the two ``Synset`` objects,
in the range 0 to 1.
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
return (2.0 * lcs_ic) / (ic1 + ic2)
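# Hedged usage sketch for lin_similarity via the public NLTK WordNet interface
# (requires the 'wordnet' and 'wordnet_ic' data packages to be downloaded):
from nltk.corpus import wordnet as wn, wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
# closely related senses score near 1, unrelated ones near 0
print(wn.synset('dog.n.01').lin_similarity(wn.synset('cat.n.01'), brown_ic))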
def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
print("IEER: First 20 Headlines")
print("=" * 45)
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
#############################################
## Spanish CoNLL2002: de(ORG, LOC)
#############################################
def conllesp():
from nltk.corpus import conll2002
de = """
.*
(
de/SP|
del/SP
)
"""
DE = re.compile(de, re.VERBOSE)
print()
print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
print("=" * 45)
rels = [rel for doc in conll2002.chunked_sents('esp.train')
for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
for r in rels[:10]: print(clause(r, relsym='DE'))
print()
def get_pos_tagger(self):
    from nltk.corpus import brown
    from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),    # articles
         (r'.*able$', 'JJ'),                 # adjectives
         (r'.*ness$', 'NN'),                 # nouns formed from adjectives
         (r'.*ly$', 'RB'),                   # adverbs
         (r'.*s$', 'NNS'),                   # plural nouns
         (r'.*ing$', 'VBG'),                 # gerunds
         (r'.*ed$', 'VBD'),                  # past tense verbs
         (r'.*', 'NN')                       # nouns (default)
         ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
    # override particular words for quantifier handling
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
         ], backoff=trigram_tagger)
    return main_tagger
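# Hypothetical usage of the chain built above (the enclosing class is not shown
# here, so `glue_obj` is an assumed instance exposing get_pos_tagger): unseen
# tokens fall back through trigram -> bigram -> unigram -> regexp until one matches.
# tagger = glue_obj.get_pos_tagger()
# print(tagger.tag(['Every', 'dog', 'chased', 'a', 'cat']))
# 'Every' and 'a' hit the overriding RegexpTagger ('univ_quant' / 'ex_quant');
# the rest come from the Brown news training data or the regexp fallback.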
def preprocess(content):
    import nltk
    from nltk.corpus import stopwords
    from string import punctuation
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    words_set = []
    for tweet in content:
        words_set += word_tokenizer.tokenize(tweet['twitter_content'])
    words_set = list(set(words_set))
    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    # keep only alphabetic words that are neither punctuation nor stopwords
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))
    nltk_words_set = list(set(nltk.corpus.words.words()))
    # training set: tweet vocabulary merged with the NLTK words corpus
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
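# Hypothetical usage sketch: as the loop above implies, `content` is assumed to be
# an iterable of dicts carrying the raw tweet text under the 'twitter_content' key
# (requires the NLTK 'stopwords', 'words', and 'wordnet' data packages).
sample_content = [{'twitter_content': 'Cats are sleeping on the warm mat'}]
vocabulary = preprocess(sample_content)
print(len(vocabulary))  # a few tweet lemmas merged with the full NLTK word list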
def store_synset_primarySense(word):
    import nltk
    from nltk.corpus import wordnet as wn
    # sort_orderedDict and primary_sense are helpers defined elsewhere in the source project
    result = {}
    check_item = sort_orderedDict(primary_sense(word.lower()))
    sense_keys = list(check_item.keys())
    if len(sense_keys) == 1:
        synset = wn.lemma_from_key(sense_keys[0]).synset()
        if synset.pos() == 'n' or synset.pos() == 'v':
            result[word] = synset
    elif len(sense_keys) > 1:
        for key in sense_keys:
            try:
                synset = wn.lemma_from_key(key).synset()
                if synset.pos() == 'n' or synset.pos() == 'v':
                    result[word] = synset
            except nltk.corpus.reader.wordnet.WordNetError:
                continue
    else:
        return 0
    return result
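# Quick illustration of the WordNet call used above: a sense key maps back to a
# Lemma, whose Synset carries the POS that the function checks for 'n' or 'v'.
# The key below is assumed to be the sense key for the animal sense of "dog";
# substitute any key returned by primary_sense().
from nltk.corpus import wordnet as wn
lemma = wn.lemma_from_key('dog%1:05:00::')
print(lemma.synset(), lemma.synset().pos())  # Synset('dog.n.01') n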
#use the lemmatizer defined in the previous workshop
def documents(self, fold=None, train=False, test=False):
"""
A generator of documents being streamed from disk. Each document is
a list of paragraphs, which are a list of sentences, which in turn is
a list of tuples of (token, tag) pairs. All preprocessing is done by
NLTK and the CorpusReader object this object wraps.
If a fold is specified (should be an integer between 0 and folds),
then the loader will return documents from that fold. Further, train
or test must be specified to split the fold correctly. This method
allows us to maintain the generator properties of document reads.
"""
for fileid in self.fileids(fold, train, test):
yield list(self.corpus.tagged(fileids=fileid))
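# Hypothetical usage, assuming `reader` is an instance of the wrapping corpus
# loader: stream one fold's training documents without loading everything at once.
# for document in reader.documents(fold=0, train=True):
#     for paragraph in document:
#         for sentence in paragraph:
#             for token, tag in sentence:
#                 pass  # consume (token, tag) pairs lazily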
##########################################################################
## Normalize Transformer
##########################################################################
def swoogle(query, termbool):
    # imports added here for self-containment; bs4's BeautifulSoup is assumed,
    # since only findAll/contents are used and no original import is shown
    import requests
    from bs4 import BeautifulSoup
    extraselectors = []
    # query the UMBC SimService for the top-N concept-similar words (N=100 or 30)
    if termbool is True:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=100&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    else:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=30&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    #relationSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+sys.argv[1]+'&pos=NN&N=100&sim_type=relation&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+sys.argv[1])
    conceptSoup = BeautifulSoup(conceptSim.text, 'html.parser')
    conceptTextArea = conceptSoup.findAll("textarea")
    conceptText = conceptTextArea[0].contents[0]
    lines = conceptText.split(",")
    for line in lines:
        line = line.strip()
        # keep only the text before the first underscore (the word itself)
        parts = line.split("_")
        extraselectors.append(parts[0])
    return extraselectors
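# Hedged usage sketch: this hits the UMBC SimService over HTTP, so it needs
# network access and the remote service to be available.
# similar_terms = swoogle('vehicle', termbool=False)   # top 30 similar words
# print(similar_terms[:10])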
def add_more_sentences(self, corpuspath):
    """
    Load sentences with relations from another corpus
    :param corpuspath: corpus path
    :return:
    """
    import pickle
    nsentences = 0
    for did in self.documents:
        nsentences += len(self.documents[did].sentences)
    print("base corpus has {} sentences".format(nsentences))
    corpus2 = pickle.load(open(corpuspath, 'rb'))
    nsentences = 0
    for did in corpus2.documents:
        if did in self.documents:
            print("repeated did:", did)
        else:
            self.documents[did] = corpus2.documents[did]
            nsentences += len(corpus2.documents[did].sentences)
            #for sentence in corpus2.documents[did].sentences:
                #if any([len(e.targets)> 1 for e in sentence.entities.elist["goldstandard"]]):
                #    print "found sentence with relations:", sentence.sid
                #if len(sentence.entities.elist["goldstandard"]) > 1:
                    #self.documents[sentence.sid] = Document(sentence.text, sentences=[sentence])
    print("added {} sentences".format(nsentences))
    self.save("corpora/Thaliana/seedev-extended.pickle")
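# Hypothetical usage, assuming `base_corpus` is an instance of the enclosing
# corpus class and the argument is any path to a pickled corpus of the same type:
# base_corpus.add_more_sentences("corpora/Thaliana/another-corpus.pickle")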
def closure(self, rel, depth=-1):
"""Return the transitive closure of source under the rel
relationship, breadth-first
>>> from nltk.corpus import wordnet as wn
>>> dog = wn.synset('dog.n.01')
>>> hyp = lambda s:s.hypernyms()
>>> list(dog.closure(hyp))
[Synset('canine.n.02'), Synset('domestic_animal.n.01'),
Synset('carnivore.n.01'), Synset('animal.n.01'),
Synset('placental.n.01'), Synset('organism.n.01'),
Synset('mammal.n.01'), Synset('living_thing.n.01'),
Synset('vertebrate.n.01'), Synset('whole.n.02'),
Synset('chordate.n.01'), Synset('object.n.01'),
Synset('physical_entity.n.01'), Synset('entity.n.01')]
"""
from nltk.util import breadth_first
synset_offsets = []
for synset in breadth_first(self, rel, depth):
if synset._offset != self._offset:
if synset._offset not in synset_offsets:
synset_offsets.append(synset._offset)
yield synset
def res_similarity(self, other, ic, verbose=False):
"""
Resnik Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node).
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
:param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
:return: A float score denoting the similarity of the two ``Synset`` objects.
Synsets whose LCS is the root node of the taxonomy will have a
score of 0 (e.g. N['dog'][0] and N['table'][0]).
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
return lcs_ic
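# Hedged usage sketch mirroring the Lin example above; Resnik similarity is just
# the IC of the least common subsumer, so it is not normalized to the 0..1 range.
from nltk.corpus import wordnet as wn, wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
print(wn.synset('dog.n.01').res_similarity(wn.synset('cat.n.01'), brown_ic))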
# Source: semantic_similarity_measure.py, from the twitter-trends-summarizer project (author: yuva29)
def info_content(lookup_word):
"""
Uses the Brown corpus available in NLTK to calculate a Laplace
smoothed frequency distribution of words, then uses this information
to compute the information content of the lookup_word.
"""
    import math
    from nltk.corpus import brown
    # N and brown_freqs are assumed to be module-level globals defined elsewhere in this file
    global N
    if N == 0:
        # poor man's lazy evaluation: build the Brown token-frequency table once
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if word not in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if lookup_word not in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
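# Worked illustration of the formula above (the counts are hypothetical): with
# N = 1,000,000 Brown tokens, a word seen n = 9 times scores
#   1 - log(10) / log(1000001) ~= 1 - 2.30 / 13.82 ~= 0.83,
# a very frequent word (n ~ 60,000) drops to roughly 0.20, and an unseen word
# (n = 0) gets the maximum information content of 1.0.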
def createPopularWords(combined, lowerBound, upperBound):
allWords = []
for message in combined:
for word in message[0]:
allWords.append(word)
allWords = nltk.FreqDist(allWords)
# grab the top several thousand words, ignoring the lowerBound most popular
# grabbing more words leads to more accurate predictions, at the cost of both memory and compute time
# ignoring the x most popular words is an easy method for handling stop words that are specific to this dataset, rather than just the English language overall
popularWords = []
wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
for pair in wordsToUse:
popularWords.append(pair[0])
return popularWords
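# Hypothetical usage: the indexing `message[0]` above implies that `combined` is
# a list of (token_list, label) pairs; lowerBound skips the most frequent tokens.
# combined = [(['the', 'cat', 'sat'], 'pos'), (['the', 'dog', 'ran'], 'neg')]
# createPopularWords(combined, lowerBound=1, upperBound=3)
# returns two of the count-1 words; 'the' is skipped as the single most common token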
# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document