def lin_similarity(self, other, ic, verbose=False):
"""
Lin Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node) and that of the two input Synsets. The relationship is
given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
:param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
:return: A float score denoting the similarity of the two ``Synset`` objects,
in the range 0 to 1.
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
return (2.0 * lcs_ic) / (ic1 + ic2)
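# Hedged usage sketch for lin_similarity via the public NLTK WordNet interface
# (requires the 'wordnet' and 'wordnet_ic' data packages to be downloaded):
from nltk.corpus import wordnet as wn, wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
# closely related senses score near 1, unrelated ones near 0
print(wn.synset('dog.n.01').lin_similarity(wn.synset('cat.n.01'), brown_ic))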
def ieer_headlines():
from nltk.corpus import ieer
from nltk.tree import Tree
print("IEER: First 20 Headlines")
print("=" * 45)
trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
for tree in trees[:20]:
print()
print("%s:\n%s" % tree)
#############################################
## Spanish CoNLL2002: de(ORG, LOC)
#############################################
def conllesp():
from nltk.corpus import conll2002
de = """
.*
(
de/SP|
del/SP
)
"""
DE = re.compile(de, re.VERBOSE)
print()
print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
print("=" * 45)
rels = [rel for doc in conll2002.chunked_sents('esp.train')
for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
for r in rels[:10]: print(clause(r, relsym='DE'))
print()
def get_pos_tagger(self):
    from nltk.corpus import brown
    from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),    # articles
         (r'.*able$', 'JJ'),                 # adjectives
         (r'.*ness$', 'NN'),                 # nouns formed from adjectives
         (r'.*ly$', 'RB'),                   # adverbs
         (r'.*s$', 'NNS'),                   # plural nouns
         (r'.*ing$', 'VBG'),                 # gerunds
         (r'.*ed$', 'VBD'),                  # past tense verbs
         (r'.*', 'NN')                       # nouns (default)
         ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
    # override particular words for quantifier handling
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
         ], backoff=trigram_tagger)
    return main_tagger
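# Hypothetical usage of the chain built above (the enclosing class is not shown
# here, so `glue_obj` is an assumed instance exposing get_pos_tagger): unseen
# tokens fall back through trigram -> bigram -> unigram -> regexp until one matches.
# tagger = glue_obj.get_pos_tagger()
# print(tagger.tag(['Every', 'dog', 'chased', 'a', 'cat']))
# 'Every' and 'a' hit the overriding RegexpTagger ('univ_quant' / 'ex_quant');
# the rest come from the Brown news training data or the regexp fallback.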
def preprocess(content):
    import nltk
    from nltk.corpus import stopwords
    from string import punctuation
    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    words_set = []
    for tweet in content:
        words_set += word_tokenizer.tokenize(tweet['twitter_content'])
    words_set = list(set(words_set))
    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    # keep only alphabetic words that are neither punctuation nor stopwords
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and (word not in non_words) and (word not in stop_words):
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))
    nltk_words_set = list(set(nltk.corpus.words.words()))
    # training set: tweet vocabulary merged with the NLTK words corpus
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
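# Hypothetical usage sketch: as the loop above implies, `content` is assumed to be
# an iterable of dicts carrying the raw tweet text under the 'twitter_content' key
# (requires the NLTK 'stopwords', 'words', and 'wordnet' data packages).
sample_content = [{'twitter_content': 'Cats are sleeping on the warm mat'}]
vocabulary = preprocess(sample_content)
print(len(vocabulary))  # a few tweet lemmas merged with the full NLTK word list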
def store_synset_primarySense(word):
    import nltk
    from nltk.corpus import wordnet as wn
    # sort_orderedDict and primary_sense are helpers defined elsewhere in the source project
    result = {}
    check_item = sort_orderedDict(primary_sense(word.lower()))
    sense_keys = list(check_item.keys())
    if len(sense_keys) == 1:
        synset = wn.lemma_from_key(sense_keys[0]).synset()
        if synset.pos() == 'n' or synset.pos() == 'v':
            result[word] = synset
    elif len(sense_keys) > 1:
        for key in sense_keys:
            try:
                synset = wn.lemma_from_key(key).synset()
                if synset.pos() == 'n' or synset.pos() == 'v':
                    result[word] = synset
            except nltk.corpus.reader.wordnet.WordNetError:
                continue
    else:
        return 0
    return result
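# Quick illustration of the WordNet call used above: a sense key maps back to a
# Lemma, whose Synset carries the POS that the function checks for 'n' or 'v'.
# The key below is assumed to be the sense key for the animal sense of "dog";
# substitute any key returned by primary_sense().
from nltk.corpus import wordnet as wn
lemma = wn.lemma_from_key('dog%1:05:00::')
print(lemma.synset(), lemma.synset().pos())  # Synset('dog.n.01') n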
#use the lemmatizer defined in the previous workshop
def documents(self, fold=None, train=False, test=False):
"""
A generator of documents being streamed from disk. Each document is
a list of paragraphs, which are a list of sentences, which in turn is
a list of tuples of (token, tag) pairs. All preprocessing is done by
NLTK and the CorpusReader object this object wraps.
If a fold is specified (should be an integer between 0 and folds),
then the loader will return documents from that fold. Further, train
or test must be specified to split the fold correctly. This method
allows us to maintain the generator properties of document reads.
"""
for fileid in self.fileids(fold, train, test):
yield list(self.corpus.tagged(fileids=fileid))
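# Hypothetical usage, assuming `reader` is an instance of the wrapping corpus
# loader: stream one fold's training documents without loading everything at once.
# for document in reader.documents(fold=0, train=True):
#     for paragraph in document:
#         for sentence in paragraph:
#             for token, tag in sentence:
#                 pass  # consume (token, tag) pairs lazily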
##########################################################################
## Normalize Transformer
##########################################################################
def swoogle(query, termbool):
    # imports added here for self-containment; bs4's BeautifulSoup is assumed,
    # since only findAll/contents are used and no original import is shown
    import requests
    from bs4 import BeautifulSoup
    extraselectors = []
    # query the UMBC SimService for the top-N concept-similar words (N=100 or 30)
    if termbool is True:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=100&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    else:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=30&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    #relationSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+sys.argv[1]+'&pos=NN&N=100&sim_type=relation&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+sys.argv[1])
    conceptSoup = BeautifulSoup(conceptSim.text, 'html.parser')
    conceptTextArea = conceptSoup.findAll("textarea")
    conceptText = conceptTextArea[0].contents[0]
    lines = conceptText.split(",")
    for line in lines:
        line = line.strip()
        # keep only the text before the first underscore (the word itself)
        parts = line.split("_")
        extraselectors.append(parts[0])
    return extraselectors
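# Hedged usage sketch: this hits the UMBC SimService over HTTP, so it needs
# network access and the remote service to be available.
# similar_terms = swoogle('vehicle', termbool=False)   # top 30 similar words
# print(similar_terms[:10])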
def add_more_sentences(self, corpuspath):
    """
    Load sentences with relations from another corpus
    :param corpuspath: corpus path
    :return:
    """
    import pickle
    nsentences = 0
    for did in self.documents:
        nsentences += len(self.documents[did].sentences)
    print("base corpus has {} sentences".format(nsentences))
    corpus2 = pickle.load(open(corpuspath, 'rb'))
    nsentences = 0
    for did in corpus2.documents:
        if did in self.documents:
            print("repeated did:", did)
        else:
            self.documents[did] = corpus2.documents[did]
            nsentences += len(corpus2.documents[did].sentences)
            #for sentence in corpus2.documents[did].sentences:
                #if any([len(e.targets)> 1 for e in sentence.entities.elist["goldstandard"]]):
                #    print "found sentence with relations:", sentence.sid
                #if len(sentence.entities.elist["goldstandard"]) > 1:
                    #self.documents[sentence.sid] = Document(sentence.text, sentences=[sentence])
    print("added {} sentences".format(nsentences))
    self.save("corpora/Thaliana/seedev-extended.pickle")
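# Hypothetical usage, assuming `base_corpus` is an instance of the enclosing
# corpus class and the argument is any path to a pickled corpus of the same type:
# base_corpus.add_more_sentences("corpora/Thaliana/another-corpus.pickle")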
def closure(self, rel, depth=-1):
"""Return the transitive closure of source under the rel
relationship, breadth-first
>>> from nltk.corpus import wordnet as wn
>>> dog = wn.synset('dog.n.01')
>>> hyp = lambda s:s.hypernyms()
>>> list(dog.closure(hyp))
[Synset('canine.n.02'), Synset('domestic_animal.n.01'),
Synset('carnivore.n.01'), Synset('animal.n.01'),
Synset('placental.n.01'), Synset('organism.n.01'),
Synset('mammal.n.01'), Synset('living_thing.n.01'),
Synset('vertebrate.n.01'), Synset('whole.n.02'),
Synset('chordate.n.01'), Synset('object.n.01'),
Synset('physical_entity.n.01'), Synset('entity.n.01')]
"""
from nltk.util import breadth_first
synset_offsets = []
for synset in breadth_first(self, rel, depth):
if synset._offset != self._offset:
if synset._offset not in synset_offsets:
synset_offsets.append(synset._offset)
yield synset
def res_similarity(self, other, ic, verbose=False):
"""
Resnik Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node).
:type other: Synset
:param other: The ``Synset`` that this ``Synset`` is being compared to.
:type ic: dict
:param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
:return: A float score denoting the similarity of the two ``Synset`` objects.
Synsets whose LCS is the root node of the taxonomy will have a
score of 0 (e.g. N['dog'][0] and N['table'][0]).
"""
ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
return lcs_ic
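# Hedged usage sketch mirroring the Lin example above; Resnik similarity is just
# the IC of the least common subsumer, so it is not normalized to the 0..1 range.
from nltk.corpus import wordnet as wn, wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
print(wn.synset('dog.n.01').res_similarity(wn.synset('cat.n.01'), brown_ic))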
# Source: semantic_similarity_measure.py, from the twitter-trends-summarizer project (author: yuva29)
def info_content(lookup_word):
"""
Uses the Brown corpus available in NLTK to calculate a Laplace
smoothed frequency distribution of words, then uses this information
to compute the information content of the lookup_word.
"""
    import math
    from nltk.corpus import brown
    # N and brown_freqs are assumed to be module-level globals defined elsewhere in this file
    global N
    if N == 0:
        # poor man's lazy evaluation: build the Brown token-frequency table once
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if word not in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if lookup_word not in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
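# Worked illustration of the formula above (the counts are hypothetical): with
# N = 1,000,000 Brown tokens, a word seen n = 9 times scores
#   1 - log(10) / log(1000001) ~= 1 - 2.30 / 13.82 ~= 0.83,
# a very frequent word (n ~ 60,000) drops to roughly 0.20, and an unseen word
# (n = 0) gets the maximum information content of 1.0.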
def createPopularWords(combined, lowerBound, upperBound):
allWords = []
for message in combined:
for word in message[0]:
allWords.append(word)
allWords = nltk.FreqDist(allWords)
# grab the top several thousand words, ignoring the lowerBound most popular
# grabbing more words leads to more accurate predictions, at the cost of both memory and compute time
# ignoring the x most popular words is an easy method for handling stop words that are specific to this dataset, rather than just the English language overall
popularWords = []
wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
for pair in wordsToUse:
popularWords.append(pair[0])
return popularWords
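# Hypothetical usage: the indexing `message[0]` above implies that `combined` is
# a list of (token_list, label) pairs; lowerBound skips the most frequent tokens.
# combined = [(['the', 'cat', 'sat'], 'pos'), (['the', 'dog', 'ran'], 'neg')]
# createPopularWords(combined, lowerBound=1, upperBound=3)
# returns two of the count-1 words; 'the' is skipped as the single most common token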
# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document