Example source code for the Python function wordpunct_tokenize()
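All of the snippets below come from open-source projects and use nltk.tokenize.wordpunct_tokenize, which splits text into alphanumeric runs and punctuation runs. As a minimal reference sketch (assuming only that NLTK is installed), the tokenizer behaves like this:

from nltk.tokenize import wordpunct_tokenize

# wordpunct_tokenize is a RegexpTokenizer using the pattern \w+|[^\w\s]+,
# so word characters and punctuation characters end up in separate tokens.
print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']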

data2tensor.py (project: deep-summarization, author: harpribot)
def generate_vocabulary(self, review_summary_file):
        """Build the vocabulary from a CSV file of (review, summary) pairs.

        :param review_summary_file: path to a CSV file whose rows are review/summary pairs
        :return: None; populates self.map and self.revmap
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Store the empty string "" as the last word of the vocabulary,
        # keeping map and revmap consistent with each other
        empty_idx = len(self.map)
        self.map[""] = empty_idx
        self.revmap[empty_idx] = ""
preprocessing.py (project: KATE, author: hugochan)
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    # replace punctuation with spaces, then tokenize
    # (text is expected to be a UTF-8 byte string, Python 2 style)
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                           text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words]
tokenized_text.py (project: Word2Vec, author: hashbangCoder)
def tokenize(directory):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        with open(directory+_file,'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):

                #   print sentence
                #   time.sleep(2)               
                #   disp_count -=1 
                #   if not disp_count:
                #       print '*'*100
                #       break

                # else:
                #   print '#'

    return wordpunct_tokenize(full_content.lower())
utils.py (project: pandora, author: mikekestemont)
def load_unannotated_file(filepath='test.txt', nb_instances=None, tokenized_input=False):
    if tokenized_input:
        instances = []
        for line in codecs.open(filepath, 'r', 'utf8'):
            line = line.strip()
            if line:
                instances.append(line)
            if nb_instances:
                nb_instances -= 1
                if nb_instances <= 0:
                    break
        return instances
    else:
        from nltk.tokenize import wordpunct_tokenize
        W = re.compile(r'\s+')
        with codecs.open(filepath, 'r', 'utf8') as f:
            text = W.sub(' ', f.read())  # collapse whitespace runs into single spaces
        tokens = wordpunct_tokenize(text)
        if nb_instances:
            return tokens[:nb_instances]
        else:
            return tokens
tokenizers.py (project: pymeetup_morphology, author: srbutler)
def _extract_tokens(self, file_text):
        """Extract tokens from a file and return a Counter dictionary.

        This method is designed specifically so that it can be overridden
        easily while maintaining _get_file_tokens and _get_dir_tokens.
        """

        token_dict = collections.Counter()

        # does a simple word and punctuation tokenization on the text
        tokens = wordpunct_tokenize(file_text)

        for token in tokens:
            token_dict[token] += 1

        return token_dict
tokenizers.py (project: pymeetup_morphology, author: srbutler)
def _extract_tokens(self, file_text):
        """Extract tokens from a Babel file and return a Counter dictionary."""

        token_dict = collections.Counter()

        # capture the line that follows each [start.time] timestamp marker
        regex = re.compile(r'\[\d*\.\d*\]\n(.*)')
        matches = regex.findall(file_text)

        tokens = set()
        for match in matches:
            wp_tokenized = wordpunct_tokenize(match)
            tokens.update(wp_tokenized)

        for token in tokens:
            token_dict[token] += 1

        return token_dict
multithreadedwikicrawler.py (project: wikicrawl, author: rodricios)
def _get_revision_word_dist(self, page_title, revid):
        """"""
        revids_to_word_dist = self.ctitle_to_revids_to_word_dist[page_title]

        if revid in revids_to_word_dist:
            return revids_to_word_dist[revid]

        text = self._get_revision_text(page_title, revid)

        text = [word.lower() for word in wordpunct_tokenize(text)
                if word.lower() not in STOPWORDS and word.lower() not in PUNCTUATION]

        pdist = StatsCounter(text).normalize()

        revids_to_word_dist[revid] = pdist

        return pdist
wikicrawler.py (project: wikicrawl, author: rodricios)
def _get_revision_word_dist(self, page_title, revid):
        """"""
        revids_to_word_dist = self.ctitle_to_revids_to_word_dist[page_title]

        if revid in revids_to_word_dist:
            return revids_to_word_dist[revid]

        text = self._get_revision_text(page_title, revid)

        text = [word.lower() for word in wordpunct_tokenize(text)
                if word.lower() not in STOPWORDS and word.lower() not in PUNCTUATION]

        pdist = StatsCounter(text).normalize()

        revids_to_word_dist[revid] = pdist

        return pdist
asa.py (project: ar-embeddings, author: iamaziz)
def tokenize(text):
        """
        :param text: a paragraph string
        :return: a list of words
        """

        try:
            try:
                txt = unicode(text, 'utf-8')  # py2
            except NameError:
                txt = text  # py3
            words = wordpunct_tokenize(txt)
            length = len(words)
        except TypeError:
            words, length = ['NA'], 0

        return words, length
utils.py (project: WebNav, author: nyu-dl)
def augment(texts, dic_thes):
    if prm.aug<2:
        return texts

    out = []
    for text in texts:

        words_orig = wordpunct_tokenize(text)
        maxrep = max(2,int(0.1*len(words_orig))) #define how many words will be replaced. For now, leave the maximum number as 10% of the words

        for j in range(prm.aug):
            words = list(words_orig) #copy
            for k in range(randint(1,maxrep)):
                idx = randint(0,len(words)-1)
                word = words[idx]
                if word in dic_thes:

                    synonym = min(np.random.geometric(0.5), len(dic_thes[word])-1) #choose the synonym based on a geometric distribution
                    #print 'fp',fp,"word", word,"synonym",dic_thes[word][synonym]
                    words[idx] = dic_thes[word][synonym]

            out.append(" ".join(words))

    return out
bytileAggregator.py (project: project-fortis, author: CatalystCode)
def __init__(self, lines):
        self.lookup = {}
        self.max_len = 0        
        ensure_package_path()
        from nltk.tokenize import wordpunct_tokenize as tokenize
        for line in lines:
            word_data = json.loads(line)
            # capture both positive and negative, choose one at scoring time
            pos_score, neg_score = word_data['pos'], word_data['neg']            
            terms = [word_data['word']]
            # TODO: make the sentiment scorer configurable
            if 'word_ar' in word_data:
                terms.append(word_data['word_ar'])
            if 'word_ur' in word_data:
                terms.append(word_data['word_ur'])
            for term in terms:
                # if scores already exist for a term, keep the least neutral (highest) ones
                existing_scores = (0., 0.)
                if term in self.lookup:
                    existing_scores = self.lookup[term]
                self.lookup[term] = (max(pos_score, existing_scores[0]), max(neg_score, existing_scores[1]))
                # update the maximum token length to check
                self.max_len = max(len(tokenize(term)), self.max_len)
bytileAggregator.py (project: project-fortis, author: CatalystCode)
def extract_keywords(sentence, keywords):
    # check if there are keywords for the sentence language
    language = sentence['Language']
    if language in keywords:
        languageKeywords = keywords[language]
        keywordMatches = []
        if languageKeywords is not None:
            message = sentence['Sentence']
            # check each keyword regex against the sentence
            for keyword in sorted(languageKeywords):
                keywordRegex = languageKeywords[keyword]
                if keywordRegex.search(message):
                    # if match, add keyword canonical form to list
                    keywordMatches.append(keyword)
        sentence['Keywords'] = keywordMatches
    return sentence
utils.py (project: stochasticLDA, author: qlai)
def parseDocument(doc, vocab):
    wordslist = list()
    countslist = list()
    doc = doc.lower()
    tokens = wordpunct_tokenize(doc)

    dictionary = dict()
    for word in tokens:
        if word in vocab:
            wordtk = vocab[word]
            if wordtk not in dictionary:
                dictionary[wordtk] = 1
            else:
                dictionary[wordtk] += 1

    wordslist.append(dictionary.keys())
    countslist.append(dictionary.values())
    return (wordslist[0], countslist[0])
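A small usage sketch for parseDocument; the vocab mapping is a made-up example, and on Python 3 the two returned values are dict views rather than plain lists:

vocab = {'cat': 0, 'hat': 1}  # hypothetical word -> id mapping
word_ids, counts = parseDocument("The Cat in the Hat, the cat!", vocab)
# The document is lower-cased before tokenizing, so 'cat' is counted twice and 'hat' once:
# word_ids -> the vocabulary ids found in the document (here 0 and 1)
# counts   -> how often each id occurred (here 2 and 1)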
data2tensor.py (project: deep-summarization, author: harpribot)
def __generate_tensor(self, is_review, reverse=False):
        """Build a zero-padded tensor of word indices for the reviews or the summaries.

        :param is_review: True to encode the review column, False for the summary column
        :param reverse: if True, reverse each index sequence (for backward/reversed input)
        :return: numpy array of shape (number of pairs, seq_length)
        """
        seq_length = self.review_max_words if is_review else self.summary_max_words
        total_rev_summary_pairs = self.rev_sum_pair.shape[0]
        data_tensor = np.zeros([total_rev_summary_pairs,seq_length])

        sample = self.rev_sum_pair[0::, 0] if is_review else self.rev_sum_pair[0::, 1]

        for index, entry in enumerate(sample.tolist()):
            index_lst = np.array([self.map[word.lower()] for word in wordpunct_tokenize(entry)])
            # reverse if want to get backward form
            if reverse:
                index_lst = index_lst[::-1]
            # Pad the list
            if len(index_lst) <= seq_length:
                index_lst = np.lib.pad(index_lst, (0,seq_length - index_lst.size), 'constant', constant_values=(0, 0))
            else:
                index_lst = index_lst[0:seq_length]

            data_tensor[index] = index_lst

        return data_tensor
preprocessing.py (project: KATE, author: hugochan)
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token
            for token in wordpunct_tokenize(
                re.sub('[%s]' % re.escape(string.punctuation), ' ',
                       text.encode(encoding='ascii', errors='ignore')))
            if not token.isdigit() and token not in stop_words]
znltk.py (project: csirtg-smrt-py, author: csirtgadgets)
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = wordpunct_tokenize(text)

    for token in tokens:
        freq_dict[token] += 1

    return sorted(freq_dict, key=freq_dict.get, reverse=True)
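A quick check of top_tokens on an arbitrary string; the most frequent token sorts first:

print(top_tokens("the cat sat on the mat")[0])
# 'the'  (the only token that occurs twice)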
generate_response.py (project: Alfred, author: JohnGiorgi)
def wikipediaAction(message):
    """Makes the appropriate calls to the wikipedia API to answer wiki queries.

    Args:
        message: An incoming text message

    Returns:
        A message indicating what action was taken with the wikipedia API
    """
    # tokenize input
    tokens = tokenize.wordpunct_tokenize(message)
    # filter stopwords, additionally, remove 'wiki' or 'wikipedia'
    tokens_filtered = remove_stopwords(tokens)
    tokens_filtered = [token for token in tokens_filtered if token != 'wiki' and token != 'wikipedia']
    # join filtered message
    message = ' '.join(tokens_filtered)

    # for debugging/testing
    print("(Highly) processed input: ", message)

    # Get the wikipedia summary for the request
    try:
        summary = wikipedia.summary(message, sentences = 1)
        url = wikipedia.page(message).url
        answer = summary + "\nSee more here: " + url
        if len(answer) > 500:
            answer = answer[0:500] + "\nSee wikipedia for more..."
    except:
        # handle all errors
        answer = "Request was not found using Wikipedia. Be more specific?"

    return answer
analytics_platform_util.py (project: fabric8-analytics-stack-analysis, author: fabric8-analytics)
def create_tags_for_package(package_name):
    """Create tags for a package based on its name."""
    stop_words = set(['org', 'com', 'io', 'ch', 'cn'])
    tags = set([tag.lower() for tag in wordpunct_tokenize(package_name) if
                tag not in string.punctuation and tag not in stop_words
                ])

    return list(tags)[:MAX_TAG_COUNT]
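An illustrative call; the package name is arbitrary and MAX_TAG_COUNT is a module-level constant defined elsewhere in the project:

# wordpunct_tokenize("org.apache.commons") -> ['org', '.', 'apache', '.', 'commons'];
# '.' is punctuation and 'org' is a stop word, so only the meaningful parts remain.
print(create_tags_for_package("org.apache.commons"))
# e.g. ['apache', 'commons']  (set order, truncated to MAX_TAG_COUNT entries)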
utils.py (project: Word2Vec, author: hashbangCoder)
def analyze_false(validData, validDataNumbers, validLabels, model):
    """Compare mean sequence lengths of correctly vs. incorrectly classified positive examples."""
    predictions = np.squeeze((model.predict(validDataNumbers) > 0.5).astype('int32'))
    c1_inds = np.where(validLabels == 1)[0]
    pos_inds = np.where((predictions+validLabels) == 2)[0] #np.squeeze(predictions) == validLabels
    neg_inds = np.setdiff1d(c1_inds,pos_inds)
    seq_lengths = np.zeros((validData.shape[0]))
    for ind,row in np.ndenumerate(validData):
            seq_lengths[ind] = len(wordpunct_tokenize(row.lower().strip())) 

    mean_true_length = np.mean(seq_lengths[pos_inds])   
    mean_false_length = np.mean(seq_lengths[neg_inds])

    return mean_false_length,mean_true_length
tokenized_text.py (project: Word2Vec, author: hashbangCoder)
def tokenize(directory,exclude_files):
    full_content = ''
    for _file in os.listdir(directory):
        #disp_count = 5
        if exclude_files  and (_file in exclude_files):
            continue
        with open(directory+_file,'r') as f:
            contents = f.readlines()
            for item in contents:
                try:
                    sentence = item.split('\t')[1].strip()
                    full_content += sentence
                except IndexError:
                    continue
                # if np.random.binomial(1,0.1):

                #   print sentence
                #   time.sleep(2)               
                #   disp_count -=1 
                #   if not disp_count:
                #       print '*'*100
                #       break

                # else:
                #   print '#'

    return wordpunct_tokenize(full_content.lower())
util.py (project: Price-Comparator, author: Thejas-1)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
tokenizers.py (project: pymeetup_morphology, author: srbutler)
def _extract_tokens(self, file_text):
        """Extract tokens from a file and return a Counter dictionary."""

        token_dict = collections.Counter()

        # matches and removes beginning and end tags
        regex = re.compile(r'(<doc id.*>|<\/doc>)')
        data = regex.sub('', file_text)

        tokens = wordpunct_tokenize(data)

        for token in tokens:
            token_dict[token] += 1

        return token_dict
train_text8model.py (project: w2vec-similarity, author: jayantj)
def get_words(sents = []):
  from nltk.tokenize import wordpunct_tokenize
  words = []
  for sent in sents:
    words.append(wordpunct_tokenize(sent))
  return words

# file_name = sys.argv[1]
preprocess.py (project: w2vec-similarity, author: jayantj)
def tokenize_into_words(sents = []):
  words = []
  for sent in sents:
    words.append(wordpunct_tokenize(sent))
  return words
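A quick example of tokenize_into_words on two arbitrary sentences:

print(tokenize_into_words(["Hello, world!", "Word2Vec rocks."]))
# [['Hello', ',', 'world', '!'], ['Word2Vec', 'rocks', '.']]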
cavnar_trenkle_impl.py (project: pylade, author: fievelk)
def _extract_text_ngram_freqs(self, text):
        """Tokenize the text.

        For each token in the text, extract ngrams of different length (from 1
        to 5). Compute how many times each of these ngrams occur in the text.
        Then return a dictionary of { ngram: frequencies }.

        >>> implementation = CavnarTrenkleImpl()
        >>> ngrams = implementation._extract_text_ngram_freqs("HeLLo")
        >>> ngrams == {'h':1, 'e': 1, 'l': 2, 'o': 1, 'he': 1, 'el': 1, 'll': 1, \
            'lo': 1, 'hel': 1, 'ell': 1, 'llo': 1, 'hell': 1, 'ello': 1, 'hello': 1}
        True
        >>> ngrams = implementation._extract_text_ngram_freqs("CIAO")
        >>> ngrams == {'c':1, 'i': 1, 'a': 1, 'o': 1, 'ci': 1, 'ia': 1, 'ao': 1, \
            'cia': 1, 'iao': 1, 'ciao': 1}
        True

        """
        tokens = wordpunct_tokenize(text.lower()) # Force lower case
        # TODO: Delete numbers and punctuation
        # TODO: Should we use nltk twitter tokenizer?

        ngram_freqs = defaultdict(int)
        for token in tokens:
            for n in range(1, 6): # Use 1-grams to 5-grams
                for ngram in ngrams(token, n):
                    ngram_string = ''.join(ngram)
                    ngram_freqs[ngram_string] += 1
                # ngram_freqs[ngrams(token, n)] += 1

        return ngram_freqs
util.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
utilities.py (project: BioNLP-2016, author: cambridgeltl)
def text_to_sentences(self, text, tokenizer, remove_stopwords=False ):
        print "text_to_sentence"
        #from nltk.tokenize import wordpunct_tokenize
        # Function to split a review into parsed sentences. Returns a 
        # list of sentences, where each sentence is a list of words
        #
        text=text.decode("utf8")
        from nltk.tokenize import sent_tokenize,wordpunct_tokenize
        # 1. Use the NLTK tokenizer to split the paragraph into sentences
        #raw_sentences = tokenizer.tokenize(text.strip())
        raw_sentences = sent_tokenize(text.strip())
        print "finish tokenize sentence",len(raw_sentences)
        #
        # 2. Loop over each sentence
        sentences = []
        for raw_sentence in raw_sentences:

            #print "sentence:",raw_sentence
            # If a sentence is empty, skip it
            if len(raw_sentence) > 0:
                # Otherwise, call review_to_wordlist to get a list of words
                #sentences.append( text_to_wordlist( raw_sentence, \
    #               remove_stopwords ))
                #print removePunctuation(raw_sentence).lower().split()
                print raw_sentence
                sentences.append(wordpunct_tokenize(raw_sentence))#raw_sentence.split())
                print wordpunct_tokenize(raw_sentence)
                #print  text_to_wordlist( raw_sentence, remove_stopwords )
        #    
        # Return the list of sentences (each sentence is a list of words,
        # so this returns a list of lists
        return sentences
strUtil.py (project: NNED, author: qolina)
def locateWord(word, wordsArr):
    if word in wordsArr:
        return wordsArr.index(word)
    else:
        idxs = [wordsArr.index(w) for w in wordsArr if word in wordpunct_tokenize(w)]
        return idxs[0]
prepareDataSet_joint.py (project: NNED, author: qolina)
def negSent2JointTrain(negSents, posSentNum):
    neg_training_data = []
    for sentId, (sent_id, sent) in enumerate(negSents):
        wordsIn = wordpunct_tokenize(sent)
        sent = " ".join(wordsIn)
        eventTypeSequence = ["O" for i in range(len(wordsIn))]
        neg_training_data.append((str(sentId + posSentNum), sent, eventTypeSequence))
    return neg_training_data
util.py (project: neighborhood_mood_aws, author: jarrellmark)
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks

