Python corpus() usage examples (source code)

wordnet.py (project: hate-to-hugs, author: sdoran35)
def lin_similarity(self, other, ic, verbose=False):
        """
        Lin Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node) and that of the two input Synsets. The relationship is
        given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects,
            in the range 0 to 1.
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return (2.0 * lcs_ic) / (ic1 + ic2)
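
A minimal usage sketch for the method above (not part of the original file), assuming the 'wordnet' and 'wordnet_ic' corpora have been downloaded with nltk.download:

# Lin similarity between two synsets, using Brown-corpus information content.
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
print(dog.lin_similarity(cat, brown_ic))  # a float between 0 and 1
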
relextract.py (project: hate-to-hugs, author: sdoran35)
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)  

    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
relextract.py (project: hate-to-hugs, author: sdoran35)
def conllesp():
    import re
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
    for r in rels[:10]: print(clause(r, relsym='DE'))
    print()
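
A quick, hedged check of the VERBOSE pattern above; the condensed pattern and the word/TAG filler strings below are illustrative, not from the original file, but mirror what extract_rels produces:

import re
DE = re.compile(r'.*(de/SP|del/SP)')           # same pattern as above, condensed
print(bool(DE.match('presidente/NC de/SP')))   # True: ends in de/SP
print(bool(DE.match('presidente/NC en/SP')))   # False: neither de/SP nor del/SP
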
glue.py (project: hate-to-hugs, author: sdoran35)
def get_pos_tagger(self):
        from nltk.corpus import brown
        from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
        regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

        return main_tagger
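
A hedged usage sketch (not in the original file); since the method never touches self, it can be exercised directly for a quick check, though training the Brown-based taggers takes a little while:

# self is unused in the body, so None is enough for a standalone demo.
tagger = get_pos_tagger(None)
print(tagger.tag('Every dog barks'.split()))
# 'Every' is tagged 'univ_quant' by the overriding RegexpTagger; the other
# tokens fall back to the Brown-trained trigram/bigram/unigram chain.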
final_crime_result.py (project: goal, author: victorskl)
def preprocess(content):
    import nltk
    from string import punctuation
    from nltk.corpus import stopwords

    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

    words_set = []
    for twitter in content:
        words_set += word_tokenizer.tokenize(twitter['twitter_content'])
    words_set = list(set(words_set))

    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # keep only alphabetic words that are neither punctuation nor stopwords
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and word not in non_words and word not in stop_words:
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))
    # combine the tweet vocabulary with the full NLTK word list
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
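
A hedged usage sketch; the input shape (a list of dicts with a 'twitter_content' field) is inferred from the loop above, the tweet texts are made up, and the stopwords, words and wordnet corpora are assumed to be downloaded:

sample_tweets = [
    {'twitter_content': 'Police reported a burglary near the station'},
    {'twitter_content': 'Heavy traffic after an accident on Main Street'},
]
vocabulary = preprocess(sample_tweets)
print(len(vocabulary))  # tweet words plus the full NLTK word list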
final_crime_result.py (project: goal, author: victorskl)
def store_synset_primarySense(word):
    result = {}
    check_item = sort_orderedDict(primary_sense(word.lower()))
    sense_keys = list(check_item.keys())
    if not sense_keys:
        return 0
    # keep the synset of each noun or verb sense; skip keys WordNet cannot resolve
    for key in sense_keys:
        try:
            synset = wn.lemma_from_key(key).synset()
        except nltk.corpus.reader.wordnet.WordNetError:
            continue
        if synset.pos() in ('n', 'v'):
            result[word] = synset
    return result
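
The helpers primary_sense and sort_orderedDict are defined elsewhere in the project, so the function above is not self-contained; as a standalone, hedged illustration of the WordNet calls it relies on:

# Resolve a sense key back to its synset, as the function above does.
from nltk.corpus import wordnet as wn
key = wn.synset('dog.n.01').lemmas()[0].key()   # a WordNet sense key string
synset = wn.lemma_from_key(key).synset()
print(synset, synset.pos())                     # Synset('dog.n.01') n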

#use the lemmatizer defined in the previous workshop
crime.py (project: goal, author: victorskl)
def preprocess(content):
    import nltk
    from string import punctuation
    from nltk.corpus import stopwords

    word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

    words_set = []
    for twitter in content:
        words_set += word_tokenizer.tokenize(twitter['twitter_content'])
    words_set = list(set(words_set))

    stop_words = stopwords.words('english')
    non_words = list(punctuation)
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

    # keep only alphabetic words that are neither punctuation nor stopwords
    formatted_twitter_words_set = []
    for word in words_set:
        if word.isalpha() and word not in non_words and word not in stop_words:
            formatted_twitter_words_set.append(lemmatizer.lemmatize(word))

    nltk_words_set = list(set(nltk.corpus.words.words()))
    # combine the tweet vocabulary with the full NLTK word list
    training_set = formatted_twitter_words_set + nltk_words_set
    return training_set
crime.py (project: goal, author: victorskl)
def store_synset_primarySense(word):
    result = {}
    check_item = sort_orderedDict(primary_sense(word.lower()))
    sense_keys = list(check_item.keys())
    if not sense_keys:
        return 0
    # keep the synset of each noun or verb sense; skip keys WordNet cannot resolve
    for key in sense_keys:
        try:
            synset = wn.lemma_from_key(key).synset()
        except nltk.corpus.reader.wordnet.WordNetError:
            continue
        if synset.pos() in ('n', 'v'):
            result[word] = synset
    return result

#use the lemmatizer defined in the previous workshop
learn.py (project: partisan-discourse, author: DistrictDataLabs)
def documents(self, fold=None, train=False, test=False):
        """
        A generator of documents streamed from disk. Each document is a list
        of paragraphs, each paragraph is a list of sentences, and each
        sentence is a list of (token, tag) tuples. All preprocessing is done by
        NLTK and the CorpusReader object this object wraps.

        If a fold is specified (should be an integer between 0 and folds),
        then the loader will return documents from that fold. Further, train
        or test must be specified to split the fold correctly. This method
        allows us to maintain the generator properties of document reads.
        """
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.tagged(fileids=fileid))
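
A hedged usage sketch; `loader` stands in for an instance of the surrounding corpus-loader class, so the names here are hypothetical:

# Stream one document and peek at its structure: paragraphs -> sentences -> (token, tag).
for document in loader.documents(fold=0, train=True):
    first_sentence = document[0][0]
    print(first_sentence[:5])   # first few (token, tag) pairs
    break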


##########################################################################
## Normalize Transformer
##########################################################################
code4lib_brainforks.py (project: brainforks, author: minervax)
def swoogle(query, termbool):
    import requests
    from bs4 import BeautifulSoup

    extraselectors = []
    # request the top 100 most similar words for term queries, 30 otherwise
    if termbool:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=100&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())
    else:
        conceptSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+query.lower()+'&pos=NN&N=30&sim_type=concept&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+query.lower())

    #relationSim = requests.get('http://swoogle.umbc.edu/SimService/GetSimilarity?operation=top_sim&word='+sys.argv[1]+'&pos=NN&N=100&sim_type=relation&corpus=webbase&query=Get+Top-N+Most+Similar+Words'+sys.argv[1])

    conceptSoup = BeautifulSoup(conceptSim.text, 'html.parser')
    conceptTextArea = conceptSoup.findAll("textarea")
    conceptText = conceptTextArea[0].contents[0]


    lines = conceptText.split(",")
    for line in lines:
        line = line.strip()
        parts = line.split("_")

        extraselectors.append(parts[0])

    return extraselectors
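
A hedged usage sketch; the call below hits the live UMBC SimService endpoint, so it needs network access and its results will vary:

related_terms = swoogle('crime', termbool=False)
print(related_terms[:10])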
seedev_corpus.py (project: IBRel, author: lasigeBioTM)
def add_more_sentences(self, corpuspath):
        """
        Load sentences with relations from another corpus
        :param corpuspath: corpus path
        :return:
        """
        nsentences = 0
        for did in self.documents:
            nsentences += len(self.documents[did].sentences)
        print "base corpus has {} sentences".format(nsentences)
        corpus2 = pickle.load(open(corpuspath, 'rb'))
        nsentences = 0
        for did in corpus2.documents:
            if did in self.documents:
                print "repeated did:", did
            else:
                self.documents[did] = corpus2.documents[did]
                nsentences += len(corpus2.documents[did].sentences)
            #for sentence in corpus2.documents[did].sentences:
                #if any([len(e.targets)> 1 for e in sentence.entities.elist["goldstandard"]]):
                #    print "found sentence with relations:", sentence.sid
                #if len(sentence.entities.elist["goldstandard"]) > 1:
                #self.documents[sentence.sid] = Document(sentence.text, sentences=[sentence])
        print "added {} sentences".format(nsentences)
        self.save("corpora/Thaliana/seedev-extended.pickle")
wordnet.py (project: FancyWord, author: EastonLee)
def closure(self, rel, depth=-1):
        """Return the transitive closure of source under the rel
        relationship, breadth-first

            >>> from nltk.corpus import wordnet as wn
            >>> dog = wn.synset('dog.n.01')
            >>> hyp = lambda s:s.hypernyms()
            >>> list(dog.closure(hyp))
            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
            Synset('carnivore.n.01'), Synset('animal.n.01'),
            Synset('placental.n.01'), Synset('organism.n.01'),
            Synset('mammal.n.01'), Synset('living_thing.n.01'),
            Synset('vertebrate.n.01'), Synset('whole.n.02'),
            Synset('chordate.n.01'), Synset('object.n.01'),
            Synset('physical_entity.n.01'), Synset('entity.n.01')]

        """
        from nltk.util import breadth_first
        synset_offsets = []
        for synset in breadth_first(self, rel, depth):
            if synset._offset != self._offset:
                if synset._offset not in synset_offsets:
                    synset_offsets.append(synset._offset)
                    yield synset
wordnet.py (project: FancyWord, author: EastonLee)
def res_similarity(self, other, ic, verbose=False):
        """
        Resnik Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node).

        :type  other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects.
            Synsets whose LCS is the root node of the taxonomy will have a
            score of 0 (e.g. N['dog'][0] and N['table'][0]).
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return lcs_ic
wordnet.py (project: FancyWord, author: EastonLee)
def lin_similarity(self, other, ic, verbose=False):
        """
        Lin Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node) and that of the two input Synsets. The relationship is
        given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects,
            in the range 0 to 1.
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return (2.0 * lcs_ic) / (ic1 + ic2)
relextract.py (project: FancyWord, author: EastonLee)
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)  

    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
relextract.py (project: FancyWord, author: EastonLee)
def conllesp():
    import re
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
    for r in rels[:10]: print(clause(r, relsym='DE'))
    print()
semantic_similarity_measure.py (project: twitter-trends-summarizer, author: yuva29)
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if word not in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = brown_freqs.get(lookup_word, 0)
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
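
A minimal sketch (not from the original file) of the module-level state info_content() expects, using the names referenced in the code above; it assumes the Brown corpus has been downloaded, and the first call is slow because it counts every Brown token:

import math
from nltk.corpus import brown

N = 0             # total token count, filled lazily on the first call
brown_freqs = {}  # word -> frequency in the Brown corpus

print(info_content('crime'))   # rarer words come out closer to 1.0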
utils.py (project: nlpSentiment, author: ClimbsRocks)
def createPopularWords(combined, lowerBound, upperBound):
    allWords = []
    for message in combined:
        for word in message[0]:
            allWords.append(word)

    allWords = nltk.FreqDist(allWords)


    # Grab the top several thousand words, ignoring the lowerBound most popular.
    # Grabbing more words leads to more accurate predictions, at the cost of both
    # memory and compute time. Ignoring the lowerBound most popular words is an
    # easy way to handle stop words that are specific to this dataset, rather
    # than just the English language overall.
    popularWords = []
    wordsToUse = allWords.most_common(upperBound)[lowerBound:upperBound]
    for pair in wordsToUse:
        popularWords.append(pair[0])

    return popularWords
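
A hedged usage sketch; the shape of combined (a list of (token_list, label) pairs, with message[0] holding the tokens) is inferred from the loop above:

import nltk

combined = [
    (['great', 'movie', 'loved', 'it'], 'pos'),
    (['terrible', 'plot', 'boring', 'movie'], 'neg'),
]
print(createPopularWords(combined, lowerBound=1, upperBound=5))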


# extract features from a single document in a consistent manner for all documents in a corpus
# simply returns whether a given word in popularWords is included in the document
wordnet.py (project: beepboop, author: nicolehe)
def closure(self, rel, depth=-1):
        """Return the transitive closure of source under the rel
        relationship, breadth-first

            >>> from nltk.corpus import wordnet as wn
            >>> dog = wn.synset('dog.n.01')
            >>> hyp = lambda s:s.hypernyms()
            >>> list(dog.closure(hyp))
            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
            Synset('carnivore.n.01'), Synset('animal.n.01'),
            Synset('placental.n.01'), Synset('organism.n.01'),
            Synset('mammal.n.01'), Synset('living_thing.n.01'),
            Synset('vertebrate.n.01'), Synset('whole.n.02'),
            Synset('chordate.n.01'), Synset('object.n.01'),
            Synset('physical_entity.n.01'), Synset('entity.n.01')]

        """
        from nltk.util import breadth_first
        synset_offsets = []
        for synset in breadth_first(self, rel, depth):
            if synset._offset != self._offset:
                if synset._offset not in synset_offsets:
                    synset_offsets.append(synset._offset)
                    yield synset
wordnet.py (project: beepboop, author: nicolehe)
def res_similarity(self, other, ic, verbose=False):
        """
        Resnik Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node).

        :type  other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects.
            Synsets whose LCS is the root node of the taxonomy will have a
            score of 0 (e.g. N['dog'][0] and N['table'][0]).
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return lcs_ic

