def tweet_connotation(tweet):
    """Decide whether a tweet is generally positive or negative."""
    analyzer = SentimentIntensityAnalyzer()
    # break the tweet up into sentences and analyze each separately
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    for s in twtcontent:
        scores = analyzer.polarity_scores(s)
        # tally up each sentence's scores
        for k in scores:
            overall[k] += scores[k]
    # average it all together for the tweet as a whole
    for k in overall:
        overall[k] = round(overall[k] / len(twtcontent), 3)
    return overall
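# A minimal usage sketch for the function above, assuming NLTK's bundled VADER
# analyzer and punkt tokenizer (the original may use the standalone vaderSentiment
# package instead); the sample tweet text is made up for illustration.
from nltk.tokenize import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# requires: nltk.download('punkt') and nltk.download('vader_lexicon')

scores = tweet_connotation("I love this phone. The battery life is terrible.")
print(scores)  # dict of averaged 'compound', 'neg', 'neu', 'pos' scores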
def tokenize_into_opinion_units(text):
    output = []
    for sent in sent_tokenize(text):
        for output_str in sent.split(' but '):
            output.append(output_str)
    return output

# Take positive.csv and negative.csv and mix them into positiveandnegative.csv,
# with each opinion unit tagged with its booking.com sentiment.
# This is the data I tagged with Mechanical Turk (see the sketch below).
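# A rough sketch (not from the original source) of the mixing step described in the
# comments above: read positive.csv and negative.csv, split each review into opinion
# units with tokenize_into_opinion_units(), tag each unit with its sentiment label,
# and write positiveandnegative.csv. The column layout is an assumption.
import csv

def mix_positive_and_negative(pos_path='positive.csv', neg_path='negative.csv',
                              out_path='positiveandnegative.csv'):
    with open(out_path, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        for path, label in [(pos_path, 'positive'), (neg_path, 'negative')]:
            with open(path, newline='') as in_f:
                for row in csv.reader(in_f):
                    # assumed: the review text sits in the first column
                    for unit in tokenize_into_opinion_units(row[0]):
                        writer.writerow([unit, label])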
def ask_confirmation(self, best_matching_action):
    alternative_formulations = sent_tokenize(self.trigger_dict[best_matching_action])
    alternative_formulation = choice(alternative_formulations)
    self.speak("Excuse me, I didn't understand your request very well. Do you want me to " + alternative_formulation)
    answer = self.active_listen()
    if "no" in answer:
        self.speak("Please reformulate your request.")
        return 0
    if "yes" in answer:
        self.speak("Very good")
        return 1
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
def convert_text2bin1(docs, writer):
    global counter
    for i, fi in enumerate(docs):
        with open(os.path.join(curdir, "input", "cnn", "stories", fi), 'r', encoding="UTF-8") as f:
            wholetext = f.read().lower()
        wholetext = re.sub(r'[^\x00-\x7F]+', '', wholetext)
        wholetext = re.sub(r"(\s?[\']\s+|\s+[\']\s?)", " ' ", wholetext)
        wholetext = re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)', ' " ', wholetext)
        wholetext = re.sub(r"(\'[s]\s+)", " 's ", wholetext)
        wholetext = wholetext.replace(".", " . ")
        wholetext = wholetext.replace(",", " , ")
        wholetext = wholetext.replace('-', ' - ')
        wholetext = wholetext.replace('?', ' ? ')
        wholetext = wholetext.replace('(', '( ')
        wholetext = wholetext.replace(')', ' )')
        data = wholetext.split("@highlight")
        news = data[0]
        highlights = data[1].replace('\n\n', '')
        news = (" ".join(news.split('\n\n'))).strip()
        sentences = sent_tokenize(news)
        news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
        highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
        words = (news + " " + highlights).split()
        counter.update(words)
        tf_example = example_pb2.Example()
        tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
        tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
        tf_example_str = tf_example.SerializeToString()
        str_len = len(tf_example_str)
        writer.write(struct.pack('q', str_len))
        writer.write(struct.pack('%ds' % str_len, tf_example_str))
        if i % 3000 == 0:
            print(int((float(i) / len(docs)) * 100), "%")
    print((float(len(docs)) / len(docs)) * 100, "%...." "converted\n\n")
def convert_text2bin2(docs, writer):
    global counter
    for i, fi in enumerate(docs):
        with open(os.path.join(curdir, "input", "dailymail", "stories", fi), 'r', encoding="UTF-8") as f:
            wholetext = f.read().lower()
        wholetext = re.sub(r'[^\x00-\x7F]+', '', wholetext)
        wholetext = re.sub(r"(\s?[\']\s+|\s+[\']\s?)", " ' ", wholetext)
        wholetext = re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)', ' " ', wholetext)
        wholetext = re.sub(r"(\'[s]\s+)", " 's ", wholetext)
        wholetext = wholetext.replace(".", " . ")
        wholetext = wholetext.replace(",", " , ")
        wholetext = wholetext.replace('-', ' - ')
        wholetext = wholetext.replace('?', ' ? ')
        wholetext = wholetext.replace('(', '( ')
        wholetext = wholetext.replace(')', ' )')
        data = wholetext.split("@highlight")
        news = data[0]
        try:
            # strip the "updated:" dateline if one is present
            news = news.split("updated:")[1]
            news = news[news.find('20') + 4:]
        except IndexError:
            pass
        news = (" ".join(news.split('\n'))).strip()
        highlights = data[1].replace('\n\n', '')
        news = (" ".join(news.split('\n\n'))).strip()
        sentences = sent_tokenize(news)
        news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
        highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
        words = (news + " " + highlights).split()
        counter.update(words)
        tf_example = example_pb2.Example()
        tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
        tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
        tf_example_str = tf_example.SerializeToString()
        str_len = len(tf_example_str)
        writer.write(struct.pack('q', str_len))
        writer.write(struct.pack('%ds' % str_len, tf_example_str))
        if i % 3000 == 0:
            print(int((float(i) / len(docs)) * 100), "%")
    print((float(len(docs)) / len(docs)) * 100, "%...." "converted\n\n")
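# A minimal usage sketch for the two converters above (assumptions, not from the
# original source): both expect module-level `curdir` and `counter` globals, the
# TensorFlow Example proto module, and a binary file handle opened for writing.
import collections, os, re, struct
from nltk.tokenize import sent_tokenize
from tensorflow.core.example import example_pb2  # assumed import path for the Example proto

curdir = os.getcwd()
counter = collections.Counter()

cnn_stories = os.listdir(os.path.join(curdir, "input", "cnn", "stories"))
with open("train.bin", "wb") as writer:  # output path is illustrative
    convert_text2bin1(cnn_stories, writer)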
def text_cleaner(data):
    paragraphs_ = ""
    try:
        keep_endings = ['.', '?']
        removals_ = open(join(settings.BASE_DIR, "aggregator", 'data', 'stop_sentences.txt'), 'r')
        removals = [r.replace('\n', '') for r in removals_]
        if data is not None:
            text = data.split('\n')
            paragraphs = []
            for p in text:
                if len(p) > settings.MINIMUM_PARAGRAPH:
                    paragraphs.append(p)
            for p in paragraphs:
                sentence_tokens = sent_tokenize(p)
                paragraph = ""
                for sentence in sentence_tokens:
                    if sentence[-1] in keep_endings:
                        if len(sentence) > settings.MINIMUM_SENTENCE:
                            # should remove most of the code:
                            if sentence[0].isupper():
                                if not any(to_remove in sentence for to_remove in removals):
                                    # eliminate some bad ending strings:
                                    if not sentence.endswith(('e.g.', 'i.e.')):
                                        paragraph += "{0} ".format(sentence)
                paragraphs_ += "<p>{0}</p>".format(paragraph)
    except Exception as e:
        print(colored.red("At text_cleaner {}".format(e)))
    return paragraphs_
def write_paragraph_lines(paragraph_lines):
    paragraph_str = ' '.join(paragraph_lines)
    for sent in sent_tokenize(paragraph_str):
        if lowercase:
            sent = sent.lower()
        output_file.write(' '.join(word_tokenize(sent)) + '\n')
def extract_target_context(self, paragraph, isolate_target_sentence):
    if isolate_target_sentence:
        for sent in sent_tokenize(paragraph):
            words, position = self.extract_context(sent)
            if words is not None:
                break
    else:
        words, position = self.extract_context(paragraph)
    return words, position
def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    doc_sents = sent_tokenize(doc)
    for sentence in doc_sents:
        miniArray = []
        for term in sentence.split():
            id = self.term_to_id(term, training)
            if id is not None:
                miniArray.append(id)
                if id not in words:
                    words[id] = 1
                    # counts how many documents a word appears in; if it appears in
                    # only a few, it can be removed from the vocabulary with cut_low_freq()
                    self.docfreq[id] += 1
        l.append(np.array(miniArray, dtype=np.int32))
    return l
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]
def make_phrases(self, start=1, end=None):
    if not end: end = start + 1
    for chain_len in range(start, end):  # +1 because of the way range works
        self.phrases[chain_len] = []
        for f in self.everything['input']:
            for line in sent_tokenize(self.everything['input'][f]):
                words = word_tokenize(line)
                for chain in self._make_chains(words, chain_len):
                    try:
                        # print "ERROR.0:", chain
                        chain = chain[:-1]  # drop the last item, as it is the "value" for the markov step
                        chain = [c for c in chain if c is not None]  # quick clean, as None breaks join
                    except:
                        print "ERROR.1:", chain
                        # sys.exit(-1)
                    # print chain_len, " => ", chain
                    try:
                        self.phrases[chain_len].append(" ".join(chain))
                    except:
                        print "ERROR.2:", chain
                        sys.exit(-1)
    return Counter(self.phrases[chain_len])
def buildGraph(text):
    vertices = []
    sentences = sent_tokenize(text, language='english')
    for sentence_raw in sentences:
        sentence_processed = sub("[^a-zA-Z ]+", '', sentence_raw).lower()
        words = word_tokenize(sentence_processed, language='english')
        vertices.append(vertex(sentence_raw, sentence_processed, words))
    for v1 in vertices:
        for v2 in vertices:
            if v1.order != v2.order:
                v1.scores.append(overlap(v1.words, v2.words))
        v1.averageScores()
    return vertices
def updateSentiment(dbLoc, tableName):
    sid = SentimentIntensityAnalyzer()
    conn = sqlite3.connect(dbLoc)
    cursor = conn.execute("SELECT * from %s" % tableName)
    # Go through every tweet in the table
    for row in cursor:
        text = cleanTweet(row[TWEET_INDEX])
        #blob = TextBlob(text)
        sent = 0.0
        count = 0
        sentList = tokenize.sent_tokenize(text)
        # Go through each sentence in the tweet
        for sentence in sentList:
            count += 1
            ss = sid.polarity_scores(sentence)
            sent += ss['compound']  # Tally up the overall sentiment
        if count != 0:
            sent = float(sent / count)
        # Write the averaged sentiment back to the DB
        conn.execute("UPDATE " + tableName + " set SENTIMENT = ? where ID = ?",
                     (sent, row[ID_INDEX]))
    conn.commit()
    conn.close()
def getSentiment(tweet):
    sid = SentimentIntensityAnalyzer()
    tweet = cleanTweet(tweet)
    sent = 0.0
    count = 0
    sentList = tokenize.sent_tokenize(tweet)
    # Go through each sentence in the tweet
    for sentence in sentList:
        count += 1
        ss = sid.polarity_scores(sentence)
        sent += ss['compound']  # Tally up the overall sentiment
    if count != 0:
        sent = float(sent / count)
    return sent
def _preprocess(self, text):
    """Return a list of lists. Each list is a preprocessed sentence of
    text in bag-of-words format."""
    stemmer = PorterStemmer()
    self._sents = sent_tokenize(text)
    # tokenize sentences
    word_sents = [word_tokenize(sent.lower()) for sent in self._sents]
    # remove stop-words and stem words
    word_sents = [[stemmer.stem(word) for word in sent if
                   word not in self._stopwords] for sent in word_sents]
    return word_sents
def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    # Split a review into parsed sentences. Returns a list of
    # sentences, where each sentence is a list of words.
    print "text_to_sentence"
    text = text.decode("utf8")
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    #raw_sentences = tokenizer.tokenize(text.strip())
    raw_sentences = sent_tokenize(text.strip())
    print "finish tokenize sentence", len(raw_sentences)
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, tokenize it into a list of words
            #sentences.append(text_to_wordlist(raw_sentence, remove_stopwords))
            print raw_sentence
            sentences.append(wordpunct_tokenize(raw_sentence))
            print wordpunct_tokenize(raw_sentence)
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def create_example(text):
    raw_sentences = sent_tokenize(text)
    sentences = [word_tokenize(s) for s in raw_sentences]
    speakers = [["" for _ in sentence] for sentence in sentences]
    return {
        "doc_key": "nw",
        "clusters": [],
        "sentences": sentences,
        "speakers": speakers,
    }
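# A short usage sketch (illustrative, not from the original source): serialize the
# dict produced by create_example() as a single JSON line, the kind of jsonlines
# record commonly fed to coreference-resolution pipelines. The input text and
# output path are made up.
import json
from nltk.tokenize import sent_tokenize, word_tokenize

example = create_example("Barack Obama was born in Hawaii. He was elected in 2008.")
with open("example.jsonlines", "w") as f:
    f.write(json.dumps(example) + "\n")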
def getSentences(paragraph):
    """
    Extracts sentences from a paragraph.
    :param paragraph: (str) paragraph text
    :returns: list of sentences, and a dict mapping sentence index to sentence
    """
    indexed = {}
    sentenceList = tokenize.sent_tokenize(paragraph)
    for i, s in enumerate(sentenceList):
        indexed[i] = s
    return sentenceList, indexed