def createData():
spwords = [unidecode(a.lower()) for a in set(nltk.corpus.cess_esp.words()) if len(a)>3]
enwords = [a.lower() for a in set(nltk.corpus.brown.words()) if len(a)>3]
jpwords = [unidecode(a) for a in jeita.words() if (len(unidecode(a)) and unidecode(a)[0].islower())]
jpwords = [a for a in set(jpwords) if len(a)>3]
# minLen = min(len(enwords), len(spwords), len(jpwords))
featuresets = \
[(createTupleDict(w,numChars),'English') for w in enwords] + \
[(createTupleDict(w,numChars),'Spanish') for w in spwords] + \
[(createTupleDict(w,numChars),'Japanese') for w in jpwords]
random.shuffle(featuresets)
l=int(len(featuresets)*0.8)
training_set = featuresets[:l]
testing_set = featuresets[l:]
return (training_set, testing_set)
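# createData() relies on a feature extractor createTupleDict(word, n) and a
# module-level numChars defined elsewhere in the original project. The sketch
# below is an assumed reconstruction (character n-gram presence features) and
# a typical training loop, not the project's actual code.
numChars = 2  # assumed n-gram width

def createTupleDict(word, n):
    # Map a word to a dict of overlapping character n-grams -> True.
    return {word[i:i + n]: True for i in range(len(word) - n + 1)}

# training_set, testing_set = createData()
# classifier = nltk.NaiveBayesClassifier.train(training_set)
# print(nltk.classify.accuracy(classifier, testing_set))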
def most_frequent_Brown_Corpus_words():
import nltk
import nltk.corpus
words = []
for word in nltk.corpus.brown.words():
if word not in [
",",
".",
"``",
"''",
";",
"?",
"--",
")",
"(",
":",
"!"
]:
words.append(word.lower())
frequencies_words = nltk.FreqDist(words).most_common()
words_most_frequent = [word[0] for word in frequencies_words]
return words_most_frequent
def brown_data():
"""return the text_length first tokens of the brown corpus tagged in pyrata format"""
tokens = brown.words()
tokens = tokens[:text_length]
pos_tags = nltk.pos_tag(tokens)
return [{'raw':w, 'pos':p} for (w, p) in pos_tags]
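# brown_data() reads a module-level text_length defined elsewhere; the value
# below is an assumption for illustration. It also needs the 'brown' corpus and
# the 'averaged_perceptron_tagger' model (via nltk.download).
text_length = 1000  # assumed sample size

# Example of the pyrata-style output:
#   brown_data()[:1]  # -> [{'raw': 'The', 'pos': 'DT'}] with nltk.pos_tag tags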
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# TEST
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
def get_word_clouds(tweets, users, words_n=50, lang='english'):
default_stopwords = set(nltk.corpus.stopwords.words(lang))
stopwords_file = '../data/stopwords.txt'
custom_stopwords = set(open(stopwords_file, 'r').read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=list(all_stopwords))
X = vectorizer.fit_transform(tweets)
terms = vectorizer.get_feature_names()
word_cloud_per_person = {}
for doc in range(len(tweets)):
feature_index = X[doc, :].nonzero()[1]
tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
doc_terms = []
for word, score in [(terms[i], score) for (i, score) in tfidf_scores]:
doc_terms.append((word, score))
important_terms = [(word, score) for word, score in sorted(doc_terms, key=lambda x: x[1], reverse=True)][:words_n]
word_cloud_per_person[users[doc]] = important_terms
return word_cloud_per_person
def get_corpus_of_most_active_users(n_users=5):
    screen_names = []
    texts = []
    with open(DATASET_PATH) as f:
        for line in f:
            # parse each line of tweet JSON once instead of three times
            tweet = json.loads(line)
            screen_names.append(tweet['user']['screen_name'])
            texts.append((tweet['user']['screen_name'], tweet['text']))
    users = nltk.FreqDist(screen_names).most_common(n_users)
    tweets_by_user = {}
    for user, text in texts:
        if user in tweets_by_user:
            tweets_by_user[user] = " ".join([tweets_by_user[user], text])
        else:
            tweets_by_user[user] = text
    corpus = [tweets_by_user[name] for name, _ in users]
    user_names = [name for name, _ in users]
    return corpus, user_names
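# A usage sketch tying the two helpers above together: per-user TF-IDF word
# clouds for the most active accounts. DATASET_PATH (line-delimited tweet JSON)
# and ../data/stopwords.txt are assumed to exist; note that on scikit-learn
# >= 1.2 the call vectorizer.get_feature_names() becomes get_feature_names_out().
#   corpus, user_names = get_corpus_of_most_active_users(n_users=5)
#   clouds = get_word_clouds(corpus, user_names, words_n=50, lang='english')
#   print(clouds[user_names[0]][:10])  # top (term, tf-idf) pairs for one user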
def cut_low_freq(self, corpus, threshold=1):
new_vocas = []
new_docfreq = []
self.vocas_id = dict()
conv_map = dict()
for id, term in enumerate(self.vocas):
freq = self.docfreq[id]
if freq > threshold:
new_id = len(new_vocas)
self.vocas_id[term] = new_id
new_vocas.append(term)
new_docfreq.append(freq)
conv_map[id] = new_id
self.vocas = new_vocas
self.docfreq = new_docfreq
return np.array([ self.conv(doc, conv_map) for doc in corpus])
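# cut_low_freq() calls self.conv(), which is not shown here. A minimal sketch
# of what it presumably does (an assumption, not the project's actual code):
# remap each document's term ids through conv_map, dropping pruned terms.
def conv(self, doc, conv_map):
    return [conv_map[term_id] for term_id in doc if term_id in conv_map]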
def closure(self, rel, depth=-1):
"""Return the transitive closure of source under the rel
relationship, breadth-first
>>> from nltk.corpus import wordnet as wn
>>> dog = wn.synset('dog.n.01')
>>> hyp = lambda s:s.hypernyms()
>>> list(dog.closure(hyp))
[Synset('canine.n.02'), Synset('domestic_animal.n.01'),
Synset('carnivore.n.01'), Synset('animal.n.01'),
Synset('placental.n.01'), Synset('organism.n.01'),
Synset('mammal.n.01'), Synset('living_thing.n.01'),
Synset('vertebrate.n.01'), Synset('whole.n.02'),
Synset('chordate.n.01'), Synset('object.n.01'),
Synset('physical_entity.n.01'), Synset('entity.n.01')]
"""
from nltk.util import breadth_first
synset_offsets = []
for synset in breadth_first(self, rel, depth):
if synset._offset != self._offset:
if synset._offset not in synset_offsets:
synset_offsets.append(synset._offset)
yield synset
def res_similarity(self, other, ic, verbose=False):
"""
Resnik Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node).
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
:param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
:return: A float score denoting the similarity of the two ``Synset`` objects.
Synsets whose LCS is the root node of the taxonomy will have a
score of 0 (e.g. N['dog'][0] and N['table'][0]).
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
return lcs_ic
def lin_similarity(self, other, ic, verbose=False):
"""
Lin Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node) and that of the two input Synsets. The relationship is
given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
:param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
:return: A float score denoting the similarity of the two ``Synset`` objects,
in the range 0 to 1.
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
return (2.0 * lcs_ic) / (ic1 + ic2)
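# A usage sketch for the two IC-based similarity methods above; it needs the
# WordNet data and an information-content file (nltk.download('wordnet'),
# nltk.download('wordnet_ic')).
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(dog.res_similarity(cat, brown_ic))  # roughly 7.9 with the Brown IC file
print(dog.lin_similarity(cat, brown_ic))  # roughly 0.88, on a 0-1 scale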
def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
print("IEER: First 20 Headlines")
print("=" * 45)
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
def conllesp():
from nltk.corpus import conll2002
de = """
.*
(
de/SP|
del/SP
)
"""
DE = re.compile(de, re.VERBOSE)
print()
print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
print("=" * 45)
rels = [rel for doc in conll2002.chunked_sents('esp.train')
for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
for r in rels[:10]: print(clause(r, relsym='DE'))
print()
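# conllesp() assumes re and the relation-extraction helpers from nltk.sem are
# already imported at module level; a minimal setup for running it (the
# CoNLL-2002 data comes from nltk.download('conll2002')):
import re
from nltk.sem.relextract import extract_rels, clause

# conllesp()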
def create_corpus(fileids, max_length=None):
"""
Creates a corpus from fileids
Removes stopwords and punctuation
Returns a list of strings
"""
sw = set(stopwords.words("english"))
tokenizer = nltk.tokenize.RegexpTokenizer(r"[A-Za-z]+")
corpus = []
for doc in fileids:
words = (w.lower() for w in tokenizer.tokenize(reuters.raw(doc)))
words = [w for w in words if w not in sw]
if max_length:
words = words[:max_length]
corpus.append(" ".join(words))
return corpus
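# create_corpus() assumes these module-level imports; the Reuters example below
# is illustrative (nltk.download('reuters'), nltk.download('stopwords')):
import nltk
from nltk.corpus import reuters, stopwords

# docs = create_corpus(reuters.fileids(categories='grain'), max_length=200)
# print(docs[0][:80])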
glue.py — source file from project PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia)
def get_pos_tagger(self):
from nltk.corpus import brown
regexp_tagger = RegexpTagger(
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN') # nouns (default)
])
brown_train = brown.tagged_sents(categories='news')
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
#Override particular words
main_tagger = RegexpTagger(
[(r'(A|a|An|an)$', 'ex_quant'),
(r'(Every|every|All|all)$', 'univ_quant')
], backoff=trigram_tagger)
return main_tagger
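# get_pos_tagger() appears to be a method of nltk.sem.glue's Glue class, with
# the tagger classes imported from nltk.tag. A rough usage sketch (glue_obj is
# a hypothetical instance):
#   from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
#   from nltk.sem.glue import Glue
#   glue_obj = Glue()
#   tagger = glue_obj.get_pos_tagger()
#   tagger.tag('Every dog barks'.split())  # 'Every' is tagged 'univ_quant'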
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation: the frequency table is built only once
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if word not in brown_word_counter:
                    brown_word_counter[word] = 0
                brown_word_counter[word] = brown_word_counter[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = brown_word_counter.get(lookup_word, 0)
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
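# info_content() relies on module-level state that is not shown above; a
# minimal setup, assuming the names used in the function:
import math
from nltk.corpus import brown  # nltk.download('brown')

N = 0                    # total Brown token count, filled lazily
brown_word_counter = {}  # word -> frequency in the Brown corpus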
short_sentence_similarity.py — source file from project Semantic-Texual-Similarity-Toolkits (author: rgtjf)
def info_content(lookup_word):
"""
Uses the Brown corpus available in NLTK to calculate a Laplace
smoothed frequency distribution of words, then uses this information
to compute the information content of the lookup_word.
"""
global N
if N == 0:
# poor man's lazy evaluation
for sent in brown.sents():
for word in sent:
word = word.lower()
if not word in brown_freqs:
brown_freqs[word] = 0
brown_freqs[word] = brown_freqs[word] + 1
N = N + 1
lookup_word = lookup_word.lower()
n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
return 1.0 - (math.log(n + 1) / math.log(N + 1))
def normalize_corpus(corpus, lemmatize=True, tokenize=False):
normalized_corpus = []
for text in corpus:
text = html_parser.unescape(text)
text = expand_contractions(text, CONTRACTION_MAP)
if lemmatize:
text = lemmatize_text(text)
else:
text = text.lower()
text = remove_special_characters(text)
text = remove_stopwords(text)
if tokenize:
text = tokenize_text(text)
normalized_corpus.append(text)
else:
normalized_corpus.append(text)
return normalized_corpus
def normalize_corpus(corpus, lemmatize=True,
only_text_chars=False,
tokenize=False):
normalized_corpus = []
for text in corpus:
text = html_parser.unescape(text)
text = expand_contractions(text, CONTRACTION_MAP)
if lemmatize:
text = lemmatize_text(text)
else:
text = text.lower()
text = remove_special_characters(text)
text = remove_stopwords(text)
if only_text_chars:
text = keep_text_characters(text)
if tokenize:
text = tokenize_text(text)
normalized_corpus.append(text)
else:
normalized_corpus.append(text)
return normalized_corpus
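# A usage sketch for the normalization pipelines above; the helpers they call
# (html_parser, expand_contractions, CONTRACTION_MAP, lemmatize_text,
# remove_special_characters, remove_stopwords, keep_text_characters,
# tokenize_text) are defined elsewhere in the project.
#   docs = ["The brown foxes are quick and they're jumping over the lazy dogs!"]
#   normalize_corpus(docs, lemmatize=True, only_text_chars=True)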