Python RegexpTokenizer() class: example source code

Source file: aligned.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, root, fileids,
             sep='/', word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             alignedsent_block_reader=read_alignedsent_block,
             encoding='latin1'):
    """
    Construct a new Aligned Corpus reader for a set of documents
    located at the given root directory.  Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
Source file: topic_modeler.py (project: Artificial-Intelligence-with-Python, author: PacktPublishing)
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if x not in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
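A minimal usage sketch for process(), assuming NLTK is installed and the 'stopwords' corpus has been downloaded; the exact stems depend on the stemmer version:

# import nltk; nltk.download('stopwords')   # one-time setup
print(process("The cats are chasing the mice in the garden"))
# -> e.g. ['cat', 'chase', 'mice', 'garden']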
def paragraph_to_words(paragraph, remove_stopwords=False, lemmatize=True, stem=False):
    # NOTE: relies on module-level `lemmatizer`, `stemmer` and `LabelDoc`
    # objects defined elsewhere in the original project.
    words = BeautifulSoup(paragraph["review"], "html.parser").get_text()
    words = re.sub("[^a-zA-Z]", " ", words)
    # tokenizer = RegexpTokenizer(r'\w+')
    # words = tokenizer.tokenize(words.strip().lower())
    words = words.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmatize:
        words = [lemmatizer.lemmatize(w) for w in words]
    if stem:
        words = [stemmer.stem(w) for w in words]
    return LabelDoc(words, paragraph["id"])
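A hedged usage sketch: `LabelDoc`, `lemmatizer` and `stemmer` are not shown in this snippet, so the stand-ins below are assumptions chosen only to make the call runnable (the 'wordnet' corpus must be downloaded for lemmatization):

from collections import namedtuple
from nltk.stem import WordNetLemmatizer, PorterStemmer

LabelDoc = namedtuple("LabelDoc", ["words", "tags"])   # assumed stand-in
lemmatizer = WordNetLemmatizer()                       # assumed stand-in
stemmer = PorterStemmer()                              # assumed stand-in

doc = paragraph_to_words({"review": "<p>The movies were great!</p>", "id": "r1"})
print(doc.words)  # e.g. ['the', 'movie', 'were', 'great']
print(doc.tags)   # 'r1'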
def tweets(word_len, sent_len, train_valid_ratio=[5, 1]):
    # NOTE: CharNumberEncoder, CatNumberEncoder and make_one_hot are
    # project-specific helpers from the surrounding codebase.
    df = pandas.read_csv('tweets_large.csv')
    field = 'text'
    label = 'label'
    tokenizer = RegexpTokenizer(r'\w+')

    # encode characters into numbers
    encoder = CharNumberEncoder(df[field].values, tokenizer=tokenizer,
                                word_len=word_len, sent_len=sent_len)
    encoder.build_char_map()
    encode_X = encoder.make_char_embed()

    # encode categories into one hot array
    cat_encoder = CatNumberEncoder(df[label])
    cat_encoder.build_cat_map()
    encode_y = cat_encoder.make_cat_embed()

    nclass = len(np.unique(encode_y))
    encode_y = make_one_hot(encode_y, nclass)

    return encode_X, encode_y, nclass
def analysis(reviews_collection_text):
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        raw_data = f.read()
    with open('data/reviews_%s' % reviews_collection_text, 'r') as f:
        comments = f.readlines()

    data = raw_data.replace('\n', ' ')
    data_lower = data.lower()
    tokens_with_punc = word_tokenize(data_lower)
    tokens = RegexpTokenizer(r'\w+').tokenize(data_lower)
    print("--- Most frequent tokens ---\n",
          FreqDist(tokens_with_punc).most_common(15))
    print("--- Tokens without punctuation ---\n",
          FreqDist(tokens).most_common(15))
    stop = set(stopwords.words('english'))
    words = [word for word in tokens if word not in stop]
    print("--- Most frequent words ---\n", FreqDist(words).most_common(15))
    tagged = pos_tag(words)
    nouns = [word for word, pos in tagged if pos == 'NN']
    print("--- Most frequent nouns ---\n", FreqDist(nouns).most_common(15))
    adjts = [word for word, pos in tagged if pos == 'JJ']
    print("--- Most frequent adjectives ---\n", FreqDist(adjts).most_common(15))
    # lexical_density() is a helper defined elsewhere in the project
    tokns = [RegexpTokenizer(r'\w+').tokenize(comment) for comment in comments]
    lxdst = [lexical_density(token) for token in tokns if len(token) > 0]
    avgld = sum(lxdst) / len(comments)
    print("--- Average lexical density ---\n", avgld)
def __init__(self, fname):
    words_map = {}
    for line in csv.reader(open(fname)):
        word, syn = line
        if word.startswith('#'):
            continue
        words_map[word] = syn
    super(CSVWordReplacer, self).__init__(words_map)
######### for now just a wrapper to RegexpTokenizer #########
def __init__(self, pattern):
    self.pattern = pattern
    self.tokenizer = RegexpTokenizer(self.pattern)
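The rest of this wrapper class is not shown; a minimal delegating method it would plausibly expose might look like the sketch below (the method name is an assumption, not taken from the source):

def tokenize(self, text):
    # Hypothetical convenience method: forward to the wrapped nltk RegexpTokenizer
    return self.tokenizer.tokenize(text)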
######## defining a default stopwords set #############
def rm_punctuation(data, pattern=r'[a-zA-Z]+-?[0-9]*', silent=1):
    if silent == 0:
        print("remove punctuation ...")
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(pattern)
    return tokenizer.tokenize(" ".join(data))
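A short usage sketch; note that `data` is expected to be an iterable of strings, which the function joins into one text before tokenizing (tokens with no letters, such as bare numbers, are dropped by the default pattern):

docs = ["Hello, world!", "COVID-19 cases rose 5%."]
print(rm_punctuation(docs))
# -> ['Hello', 'world', 'COVID-19', 'cases', 'rose']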
Source file: topic_modeling.py (project: Python-Machine-Learning-Cookbook, author: PacktPublishing)
def __init__(self):
    # Create a regular expression tokenizer
    self.tokenizer = RegexpTokenizer(r'\w+')

    # Get the list of stop words
    self.stop_words_english = stopwords.words('english')

    # Create a Snowball stemmer
    self.stemmer = SnowballStemmer('english')

# Tokenizing, stop word removal, and stemming
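The method that the comment above introduces is cut off in this listing; below is an assumed reconstruction based only on the attributes created in __init__ (a sketch, not the verbatim cookbook code):

def process(self, input_text):
    # Tokenize, drop English stop words, then stem what remains
    tokens = self.tokenizer.tokenize(input_text.lower())
    tokens = [x for x in tokens if x not in self.stop_words_english]
    return [self.stemmer.stem(x) for x in tokens]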
def get_list():
    stop_words = set(stopwords.words('english'))
    filename = 'data/new_acronyms.json'
    f = open(filename, 'r')
    data = json.load(f)
    paragraph_list = []
    full_form_list = []
    for k, v in data.items():
        if k == "WDM":
            for poss in v['possibilities']:
                paragraph_list.append(poss['summary'])
                full_form_list.append(poss['full_form'])
    s = "two devices can also function as an add/drop multiplexer (ADM), i.e. simultaneously adding light beams while dropping other light beams and rerouting them to other destinations and devices. Formerly, such filtering of light beams was done with etalons, devices called Fabry–Pérot interferometers using thin-film-coated optical glass. The first WDM technology was conceptualized in the early 1970s and realized in the laboratory in the late 1970s; but these only combined two signals, and many years later were still very expensive.As of 2011, WDM systems can handle 160 signals, which will expand a 10 Gbit/second system with a single fiber optic pair of conductors to more than 1.6 Tbit/second (i.e. 1,600 Gbit/s).Typical WDM systems use single-mode optical fiber (SMF); this is optical fiber for only a single ray of light and having a core diameter of 9 millionths of a meter (9 µm). Other systems with multi-mode fiber cables (MM Fiber; also called premises cables) have core diameters of about 50 µm. Standardization and extensive research have brought down system costs significantly."
    paragraph_list.append(s)
    full_form_list.append("Wavelength context")
    texts = []
    taggeddoc = []
    p_stemmer = PorterStemmer()
    tokeniser = RegexpTokenizer(r'\w+')
    for index, para in enumerate(paragraph_list):
        raw = para.lower()
        tokens = tokeniser.tokenize(raw)
        stopped_tokens = [t for t in tokens if t not in stop_words]
        # keep only purely alphabetic tokens (the original used `x.isalpha`
        # without the call parentheses, which is always truthy)
        number_tokens = [x for x in stopped_tokens if x.isalpha()]
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        texts.append(length_tokens)
        td = TaggedDocument(' '.join(stemmed_tokens).split(), [full_form_list[index]])
        taggeddoc.append(td)
    return taggeddoc
def get_summarized(self, input_data, num_sentences):
    # TODO: allow the caller to specify the tokenizer they want
    # TODO: allow the user to specify the sentence tokenizer they want
    # TODO multilingual!
    tokenizer = RegexpTokenizer(r'\w+')
    stopwords_ = [smart_text(word) for word in stopwords.words('english')]

    # get the frequency of each word in the input
    base_words = [smart_text(word.lower())
                  for word in tokenizer.tokenize(smart_text(input_data))]
    words = [smart_text(word) for word in base_words if word not in stopwords_]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in word_frequencies.most_common(100)]

    # break the input up into sentences.  working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    actual_sentences = sent_detector.tokenize(input_data)
    working_sentences = [sentence.lower() for sentence in actual_sentences]

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []
    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i]
                    and actual_sentences[i] not in output_sentences):
                output_sentences.append(actual_sentences[i])
                break
            if len(output_sentences) >= num_sentences:
                break
        if len(output_sentences) >= num_sentences:
            break

    # sort the output sentences back to their original order
    return self.reorder_sentences(output_sentences=output_sentences,
                                  input_data=input_data)
def split_sentence_into_words(sentence):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(sentence.lower())

def remove_punctuation(str):
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(str)
def tokenize(text, level):
    """Tokenize a text into a list of strings.

    Args:
        text (str): An arbitrary string.
        level (str): Either "char" or "word". For "char", the string is split into characters.
            For "word", letters and numbers are glued to themselves and everything else is split.
            Example: "asdf df!?123 as12" -> "asdf", " ", "df", "!", "?", "123", " ", "as", "12"

    Returns:
        list[str]: The tokens

    Raises:
        ValueError: If the level is not "char" or "word"
    """
    if level == "char":
        # No need for tokenizing
        return list(text)
    elif level == "word":
        # Tokenize while keeping indentation. Glue letters and numbers to themselves but
        # keep all other chars isolated.
        tokenizer = RegexpTokenizer(r'\w+|\S|\s')
        return tokenizer.tokenize(text)
    else:
        raise ValueError("Unknown token level: {}".format(level))
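A quick usage sketch showing both levels (assuming nltk is installed):

print(tokenize("as12 df!", "word"))   # -> ['as12', ' ', 'df', '!']
print(tokenize("ab!", "char"))        # -> ['a', 'b', '!']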
Source file: rte_classify.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])

    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                         'denied'])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)

    if lemmatize:
        # NOTE: inside this method the name `lemmatize` is the boolean
        # parameter, so passing lemmatize=True would try to call a bool
        # here; this quirk is preserved from the original snippet.
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords

    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
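A hedged usage sketch, assuming this constructor belongs to NLTK's RTEFeatureExtractor and that the RTE corpus is available (nltk.download('rte')):

import nltk

rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[0]
extractor = RTEFeatureExtractor(rtepair)
print(extractor._overlap)    # word types shared by text and hypothesis
print(extractor._hyp_extra)  # word types found only in the hypothesis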
Source file: ycoe.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def __init__(self, root, items, encoding='utf8'):
    gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
    sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
    TaggedCorpusReader.__init__(self, root, items, sep='_',
                                sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe.
def remove_punc(string):
    '''Description: This function takes in a string of descriptions and returns a tokenized string without punctuation
    Parameters: String of descriptions
    Output: Tokenized string with punctuation removed'''
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(string)
    return " ".join(tokens)
def getAllReviews(movieList):
    # List comprehensions replace the lazy map/filter objects of the original,
    # which break np.array/np.concatenate under Python 3; the stemmer and
    # stop-word set are hoisted out of the loop for speed.
    reviews = np.concatenate([movie["reviews"] for movie in movieList])
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = [stemmer.stem(x) for x in s]
        s = [x for x in s if x not in stop_words]
        tokenizeReview.append((s, 'pos' if review["score"] >= 30 else 'neg'))
    return tokenizeReview
def getAllCritics(movieList):
    # Same Python 3 fixes as getAllReviews above.
    reviews = np.concatenate([movie["critics"] for movie in movieList])
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    tokenizeReview = []
    for review in reviews:
        s = review['review']
        s = RegexpTokenizer(r'\w+').tokenize(s.lower())
        s = [stemmer.stem(x) for x in s]
        s = [x for x in s if x not in stop_words]
        tokenizeReview.append((s, 'pos' if review["tomatometer"] == "fresh" else 'neg'))
    return tokenizeReview