Example source code for Python's sent_tokenize()
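All of the snippets on this page use NLTK's Punkt-based sent_tokenize, usually together with word_tokenize; unless shown otherwise, each excerpt assumes "from nltk.tokenize import sent_tokenize, word_tokenize" plus the project's own imports. A minimal sketch of the basic call, assuming the punkt model has been downloaded once:

import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")  # one-time download of the Punkt sentence model

text = "Dr. Smith went home. He slept for ten hours."
print(sent_tokenize(text))
# ['Dr. Smith went home.', 'He slept for ten hours.']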

Source file: Reader.py, project: scientific-paper-summarisation, author: EdCo95
def read_folder(self, folder_name, number_of_files_to_read=10000):
        """
        Reads all files in a directory, splits them into sentences and puts these sentences in a list to return.
        Args:
            folder_name = the name of the folder to read files from
            number_of_files_to_read = optional parameter for how many files in a directory to read
        Returns:
            A list of all sentences from all text files in the folder
        """
        count = 0
        all_sentences = []
        for filename in os.listdir(folder_name):
            if filename.endswith(".txt") and count < number_of_files_to_read:
                main_text_to_open = folder_name + "/" + filename
                main_text = self.open_file_single_string(main_text_to_open)
                # Python 2 bytes handling: decode, then drop non-ASCII characters
                udata = main_text.decode("utf-8")
                main_text = udata.encode("ascii", "ignore")
                sentences = sent_tokenize(main_text)
                for sentence in sentences:
                    all_sentences.append(sentence)
                # count only the files actually read, as the docstring promises
                count += 1
        return all_sentences
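The decode/encode pair above is a Python 2 idiom for stripping non-ASCII characters. A minimal Python 3 sketch of the same read-and-split step (the function name is ours, assuming plain UTF-8 .txt files):

import os
from nltk.tokenize import sent_tokenize

def read_folder_py3(folder_name, limit=10000):
    # Python 3 sketch: read up to limit .txt files and return
    # one flat list of sentences, dropping non-ASCII characters.
    all_sentences = []
    count = 0
    for filename in os.listdir(folder_name):
        if filename.endswith(".txt") and count < limit:
            path = os.path.join(folder_name, filename)
            with open(path, encoding="utf-8") as f:
                text = f.read().encode("ascii", "ignore").decode("ascii")
            all_sentences.extend(sent_tokenize(text))
            count += 1
    return all_sentences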
Source file: model.py, project: deeppavlov, author: deepmipt
def create_batch(self, sentence_li):
        """Create a batch for a list of sentences."""

        embeddings_batch = []
        for sen in sentence_li:
            embeddings = []
            sent_toks = sent_tokenize(sen)
            word_toks = [word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tokens = [el for el in tokens if el != '']
            for tok in tokens:
                embeddings.append(self.embdict.tok2emb.get(tok))
            if len(tokens) < self.max_sequence_length:
                pads = [np.zeros(self.embedding_dim) for _ in range(self.max_sequence_length - len(tokens))]
                embeddings = pads + embeddings
            else:
                embeddings = embeddings[-self.max_sequence_length:]
            embeddings = np.asarray(embeddings)
            embeddings_batch.append(embeddings)
        embeddings_batch = np.asarray(embeddings_batch)
        return embeddings_batch
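Each batch row is forced to a fixed max_sequence_length by left-padding with zero vectors or keeping only the most recent embeddings; the same logic in isolation, with toy sizes:

import numpy as np

MAX_LEN, DIM = 5, 3  # toy sequence length and embedding size

def pad_or_truncate(embeddings):
    # left-pad short sequences with zero vectors,
    # keep only the last MAX_LEN vectors of long ones
    if len(embeddings) < MAX_LEN:
        pads = [np.zeros(DIM) for _ in range(MAX_LEN - len(embeddings))]
        embeddings = pads + embeddings
    else:
        embeddings = embeddings[-MAX_LEN:]
    return np.asarray(embeddings)

print(pad_or_truncate([np.ones(DIM)] * 2).shape)  # (5, 3)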
Source file: build_dictionary.py, project: samnorsk, author: gisleyt
def article_to_pairs(arg):
    article, direction = arg
    pairs = []

    if 'text' not in article:
        return []

    sents = sent_tokenize(article['text'], language='norwegian')
    translations = translate(sents, direction)

    for sent, trans in zip(sents, translations):
        trans_tokens = tokenize(trans)
        tokens = tokenize(sent)

        pairs += compare(tokens, trans_tokens)

    del article
    del sents
    del translations

    return pairs
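sent_tokenize takes an optional language argument that selects the matching pre-trained Punkt model, Norwegian here; a minimal sketch:

from nltk.tokenize import sent_tokenize

text = "Dette er en setning. Her er en til."
print(sent_tokenize(text, language="norwegian"))
# ['Dette er en setning.', 'Her er en til.']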
Source file: NewsArticleClass.py, project: Python-Scripts-Repo-on-Data-Science, author: qalhata
def extractFeatures(self, article, n, customStopWords=None):
        # pass in article as a tuple ( text, title)
        text = article[0]
        # extract the text
        title = article[1]
        # extract the title
        sentences = sent_tokenize(text)
        # split text into sentences
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        # split sentences into words
        self._freq = self._compute_frequencies(word_sent, customStopWords)
        # calculate word freq using member func created above
        if n < 0:
            # how many features (words) to return - a negative number means
            # no feature (word) selection, just return all features
            return nlargest(len(self._freq),
                            self._freq, key=self._freq.get)
        else:
            # if the caller has asked for a subset, return only the 'n'
            # largest features, i.e. the most important words
            # (important == frequent, excluding stopwords)
            return nlargest(n, self._freq, key=self._freq.get)
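The nlargest(n, freq, key=freq.get) call used here (and in the summarizers below) picks the n highest-frequency words from a dict; in isolation:

from heapq import nlargest

freq = {"market": 4, "stock": 3, "the": 9, "rally": 2}
print(nlargest(2, freq, key=freq.get))  # ['the', 'market']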
Source file: NewsArticleClass.py, project: Python-Scripts-Repo-on-Data-Science, author: qalhata
def summarize(self, article, n):
        text = article[0]
        # article is a (text, title) tuple; summarize the text, not the title
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i, sentence in enumerate(word_sent):
            for word in sentence:
                if word in self._freq:
                    ranking[i] += self._freq[word]
        sentences_index = nlargest(n, ranking, key=ranking.get)
        return [sentences[j] for j in sentences_index]

##############################################################################
# TEST
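A self-contained sketch of the same frequency-ranking summarizer for quick testing; stopword filtering is omitted for brevity:

from collections import defaultdict
from heapq import nlargest
from nltk.tokenize import sent_tokenize, word_tokenize

def summarize_text(text, n):
    # score each sentence by its summed word frequencies, return the top n
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    freq = defaultdict(int)
    for words in word_sent:
        for w in words:
            freq[w] += 1
    ranking = defaultdict(int)
    for i, words in enumerate(word_sent):
        for w in words:
            ranking[i] += freq[w]
    return [sentences[j] for j in nlargest(n, ranking, key=ranking.get)]

print(summarize_text("Cats sleep a lot. Cats eat fish. Dogs bark.", 1))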
Source file: load_parse.py, project: medknow, author: kbogas
def mmap_extract(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - concepts: list,
        list of metamap concepts extracted
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)),
                                           word_sense_disambiguation=True)
    if errors:
        print('Errors with extracting concepts!')
        print(errors)
    return concepts
Source file: tweet.py, project: SocialNPHS, author: SocialNPHS
def person_connotation(tweet, name):
    """
    Decide whether a person is spoken of favorably or not, based on
    the tone of the sentences in which their name appears
    """
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    mentions = 0
    # analyze each sentence talking about `name` person
    for s in twtcontent:
        tags = get_tweet_tags(s)
        # if the name appears in the tagged sentence, get its tone
        if (name, 'NNP') in tags:
            sentence = util.untag(tags)
            scores = tweet_connotation(' '.join(sentence))
            # add it up to the overall tweet's tone
            for z in scores:
                overall[z] += scores[z]
            mentions += 1
    # averaging all sentences' scores. don't wanna divide by zero now do we
    if mentions != 0:
        for v in overall:
            overall[v] = round(overall[v] / mentions, 3)
    return overall
Source file: term_collector.py, project: QProb, author: quant-trade
def make_summaries():
    terms = Terms.objects.all()

    removals = ['DEFINITION', 'BREAKING DOWN', 'What is']

    for term in terms:
        try:
            summary = summarizer(term.text, settings.SUMMARIZER_SENTENCES)
            sentence_tokens = sent_tokenize(summary)
            text = ''
            for sentence in sentence_tokens:
                if not any(to_remove in sentence for to_remove in removals):
                    # str.replace() would treat the pattern literally; use re.sub
                    # to strip a leading numeral pattern (requires import re)
                    text += "{0} ".format(re.sub(r'\A[\d]\S\s', '', sentence))

            term.summary = summarizer(text, settings.SUMMARIZER_SENTENCES)
            term.save()
        except Exception as e:
            print((colored.red("[ERROR] At terms summarizer: {0}".format(e))))
Source file: youtube.py, project: QProb, author: quant-trade
def clean_video(video):
    text = []
    try:
        if len(video.description) > 0:
            sentence_tokens = sent_tokenize(video.description)

            for sentence in sentence_tokens:
                if not ('http' in sentence):
                    text.append("{0} ".format(sentence))

        video.description = "".join("{} ".format(s) for s in text)
        video.save()
        if settings.SHOW_DEBUG:
            print(colored.green("Cleaned video description saved to db: {0}".format(video.title)))
    except Exception as e:
        print(colored.red("At clean_video {}".format(e)))
Source file: vocabulary.py, project: topicModelling, author: balikasg
def doc_to_ids(self, doc, training=True):
        l = []
        words = dict()
        window = 150
#        doc = doc.replace("&ndash;", " ")
#        doc = sent_tokenize(doc)
        for sentence in doc:
            miniArray = []
            for term in sentence:
                id = self.term_to_id(term, training)
                if id is not None:
                    miniArray.append(id)
                    if id not in words:
                        words[id] = 1
                        self.docfreq[id] += 1
            if not len(miniArray):
                continue
            if len(miniArray) > window:
                l.extend([np.array(miniArray[i:i+window]) for i in xrange(0, len(miniArray), window)])
            else:
                l.append(np.array(miniArray))
        return l
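Sentences longer than the 150-id window are split into consecutive chunks; the slicing pattern in isolation (Python 3 range in place of xrange):

import numpy as np

window = 4
ids = list(range(10))
chunks = [np.array(ids[i:i + window]) for i in range(0, len(ids), window)]
print([c.tolist() for c in chunks])
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]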
Source file: summarizer.py, project: delbot, author: shaildeliwala
def summarize(self, text, n):
        """
          Return a list of n sentences
          which represent the summary of text.
        """
        sents = sent_tokenize(text)
        assert n <= len(sents)
        word_sent = [word_tokenize(s.lower()) for s in sents]
        self._freq = self._compute_frequencies(word_sent)
        ranking = defaultdict(int)
        for i,sent in enumerate(word_sent):
            for w in sent:
                if w in self._freq:
                    ranking[i] += self._freq[w]
        sents_idx = self._rank(ranking, n)
        return [sents[j] for j in sents_idx]
Source file: data_integration.py, project: LanguageAnalysis, author: trideeprath
def parse_xml_language_similarity(file_read,file_write):
    count = 0
    with open(file_read,'r') as f, open(file_write,'w') as out:
        for line in f:
            count += 1
            if count % 1000 == 0:
                print(count)
            if "row Id" in line:
                line = line.strip()
                root = xml.etree.ElementTree.fromstring(line)
                try:
                    body = remove_tags(root.get('Body'))
                    title = remove_tags(root.get('Title'))
                    body_sentences = sent_tokenize(body)
                    title_sentences = sent_tokenize(title)
                    for line in body_sentences:
                        out.write(line+"\n")
                    for line in title_sentences:
                        out.write(line+"\n")
                except Exception:
                    continue
Source file: markov2.py, project: markov_bot, author: 18F
def train(self, chain_len = None):
        """ Trains the markov data structure by creating chains of desired length """
        if not chain_len:
            chain_len = self.CHAIN_LENGTH

        self.CHAIN_LEN = chain_len

        self.everything['corpus'] = {}
        self.corpus = self.everything['corpus']

        for f in self.everything['input']:
            for line in sent_tokenize( self.everything['input'][f] ):
                words = word_tokenize(line)

                for chain in self._make_chains(words):
                    k = " ".join( chain[:-1] ) # key is everything but last word
                    v = chain[-1] # value is last word

                    try:
                        self.corpus[k].append(v)
                    except KeyError:
                        self.corpus[k] = [v]
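_make_chains is not shown in this excerpt; a plausible standalone sketch of the sliding-window chains it appears to produce (the helper's exact behavior is our assumption):

from nltk.tokenize import sent_tokenize, word_tokenize

def make_chains(words, chain_len=3):
    # slide a window of chain_len words across the token list
    for i in range(len(words) - chain_len + 1):
        yield words[i:i + chain_len]

corpus = {}
for line in sent_tokenize("the cat sat. the cat slept."):
    for chain in make_chains(word_tokenize(line)):
        k = " ".join(chain[:-1])                    # key: all but the last word
        corpus.setdefault(k, []).append(chain[-1])  # value: the last word
print(corpus)
# {'the cat': ['sat', 'slept'], 'cat sat': ['.'], 'cat slept': ['.']}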
Source file: utils_claimset.py, project: patentdata, author: benhoyle
def nltk_extract_claims(text):
    """
    Attempts to extract claims as a list from a large text string.
    Uses nltk sent_tokenize function in tokenize library
    param string text: string containing several claims
    """
    sent_list = sent_tokenize(text)
    # On a test string this returned a list with the claim number
    # and then the claim text as separate items
    claims_list = []
    for i in range(0, len(sent_list), 2):
        try:
            number = int(sent_list[i].split(".")[0])
        except ValueError:
            number = 0

        claims_list.append(
            (number, sent_list[i+1])
        )

    return claims_list
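A usage sketch; as the docstring notes, the (number, text) pairing relies on Punkt splitting each claim number into its own "sentence", and the i+1 lookup can raise IndexError when the split does not alternate that way:

claims = ("1. A device comprising a sensor. "
          "2. The device of claim 1, wherein the sensor is optical.")
print(nltk_extract_claims(claims))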
Source file: core.py, project: qas, author: kusha
def check_sentence(text):
        """
        Check, that only one sentence was provided.

        >>> QASystem.check_sentence("Example sentence.")
        >>> QASystem.check_sentence("Example sentence. Another example.")
        Traceback (most recent call last):
        core.MultipleSentences: ['Example sentence.', 'Another example.']

        Args:
            text (str): provided question/answer.

        Returns:
            None

        Raises:
            MultipleSentences: in case of more than one sentence inside
            of the text string.
        """
        sent_tokenize_list = sent_tokenize(text)  # nltk tokenize sentence
        if len(sent_tokenize_list) > 1:
            raise MultipleSentences(sent_tokenize_list)
Source file: yelp_data_prepare.py, project: NN_sentiment, author: hx364
def read_yelp(file_name='yelp_academic_dataset_review.json'):

    # each line is one JSON record; json.loads (requires import json) is safer
    # than eval, and JSON's true/false/null are not valid Python literals anyway
    with open(file_name) as fin:
        f = [json.loads(line) for line in fin]
    stars = [i['stars'] for i in f]
    text = [i['text'] for i in f]

    df = pd.DataFrame()
    df['stars'] = stars
    df['text'] = text

    #compute the number of sentences in each doc
    l = list(df.text)
    text = [sent_tokenize(i) for i in list(df.text)]
    text_len = [len(i) for i in text]

    #2225188 in total
    #2089287 for length<=20
    #1654640 for length<=10
    #We decide to only consider length<=7 here
    df['length'] = text_len
    df['text_split'] = text
    return df
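A usage sketch of the length filter described in the comments above, assuming the yelp review file is present:

df = read_yelp()
short = df[df['length'] <= 7]
print(len(short), "reviews kept of", len(df))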
Source file: lyrics.py, project: DropMuse, author: DropMuse
def get_sentiment(song):
    scores = dict([('pos', 0), ('neu', 0), ('neg', 0), ('compound', 0)])

    if not song:
        return scores

    raw_text = song
    raw_text = re.sub("\n", ". ", str(raw_text))

    # Using already trained
    sid = SentimentIntensityAnalyzer()
    sentences = tokenize.sent_tokenize(raw_text)

    for sentence in sentences:

        ss = sid.polarity_scores(sentence)

        for k in sorted(ss):
            scores[k] += ss[k]

    return scores
Source file: extras.py, project: semeval2017-scienceie, author: UKPLab
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
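A usage sketch (with re, sent_tokenize and word_tokenize imported) showing the global character offsets it returns:

for tok, (start, end) in offset_tokenize("Hi there. Bye."):
    print(tok, start, end)
# Hi 0 2
# there 3 8
# . 8 9
# Bye 10 13
# . 13 14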
Source file: corpus_cleaner.py, project: acl2017-interactive_summarizer, author: UKPLab
def parse_xml_all(self, data_file, doc_type, language='english'):
        e = ET.parse(data_file)
        cluster_data = {}
        root = e.getroot()
        for topics in root:
            data = []
            topic_id = topics.attrib.get('id')
            for documents in topics.findall(doc_type):
                doc_id = documents.attrib.get('id')
                if doc_type == 'document':
                    title_text = documents.find('title').text
                doc_text = documents.find('text').text
                text = text_normalization(doc_text)
                doc_sents = sent_tokenize(text, language)
                data.append([doc_id, doc_sents])
            cluster_data[topic_id] = data
        return cluster_data
Source file: text_helper.py, project: vismooc-data-server, author: HKUST-VISLab
def analysis(self, paragraph):
        ''' Analyze the sentiment of the given paragraph. '''
        result = 0
        counter = 0
        sentences = tokenize.sent_tokenize(paragraph)
        for sentence in sentences:
            sentiment = self.analyzer.polarity_scores(sentence)['compound']
            if sentiment > SentimentAnalyzer.neutral_threshold[0] and \
                sentiment < SentimentAnalyzer.neutral_threshold[1]:
                continue

            counter += 1
            result += sentiment

        result = result / float(counter) if counter > 0 else 0
        return result
Source file: embeddings_dict.py, project: deeppavlov, author: deepmipt
def add_items(self, sentence_li):
        """Add new items to the tok2emb dictionary from a given text."""

        for sen in sentence_li:
            sent_toks = sent_tokenize(sen)
            word_toks = [word_tokenize(el) for el in sent_toks]
            tokens = [val for sublist in word_toks for val in sublist]
            tokens = [el for el in tokens if el != '']
            for tok in tokens:
                if self.tok2emb.get(tok) is None:
                    self.tok2emb[tok] = self.fasttext_model[tok]
Source file: sentiment.py, project: crypto-sentiment, author: codingupastorm
def get_sentiment_from_paragraph(paragraph):
    sentence_list = tokenize.sent_tokenize(paragraph)
    if not sentence_list:
        # guard: avoid ZeroDivisionError on empty input
        return 0.0
    paragraphSentiments = 0.0
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        paragraphSentiments += vs["compound"]
    return round(paragraphSentiments/len(sentence_list), 4)
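A usage sketch wiring up the module-level analyzer with NLTK's bundled VADER model; both downloads are one-time:

import nltk
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download("punkt")
nltk.download("vader_lexicon")
analyzer = SentimentIntensityAnalyzer()

print(get_sentiment_from_paragraph("I love this. The ending was awful."))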
Source file: nazgul.py, project: nazgul, author: TartuNLP
def pre_processing(tokenizer, truecaser, info):
    # SPLIT THE WHITESPACES
    source_file_t = re.split('([\t\n\r\f\v]+)', info['src'])

    # SENTENCE TOKENIZE
    for i in range(len(source_file_t)):
        if i % 2 == 0:
            source_file_t[i] = sent_tokenize(source_file_t[i])

    # TOKENIZATION
    if info['tok']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(
                            tokenizer.tokenize(source_file_t[j][i], return_str=True).encode('utf-8'))
                    except NameError:
                        source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))

    # TRUECASING
    if info['tc']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    source_file_t[j][i] = str((truecasing(truecaser, source_file_t[j][i].split(' ')[0]).decode(
                        'utf-8') + " " + (' '.join(source_file_t[j][i].split(' ')[1:]).decode('utf-8'))).encode('utf-8'))
                    print(source_file_t[j][i])

    # IF NEITHER
    if not (info['tc'] or info['tok']):
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(source_file_t[j][i].encode('utf-8'))
                    except NameError:
                        source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))

    return source_file_t
Source file: NewsArticleClass.py, project: Python-Scripts-Repo-on-Data-Science, author: qalhata
def extractRawFrequencies(self, article):
        # this method is similar to extractFeatures but returns
        # the raw frequencies (full word counts)
        text = article[0]
        # article is a (text, title) tuple; only the text is counted
        sentences = sent_tokenize(text)
        word_sent = [word_tokenize(s.lower()) for s in sentences]
        freq = defaultdict(int)
        for s in word_sent:
            for word in s:
                if word not in self._stopwords:
                    freq[word] += 1
        return freq
Source file: eagle.py, project: stock-eagle, author: mtusman
def sentence(text):
    '''Break the text into sentences'''
    return sent_tokenize(text)
Source file: generate.py, project: ask_data_science, author: AngelaVC
def getSentences(self):
        self.sentences = sent_tokenize(self.text)
Source file: data_extractor.py, project: medknow, author: kbogas
def metamap_wrapper(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - a dictionary with key sents and values
        a list of the concepts found
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    # Load Metamap Instance
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)))
    # Keep the sentence ids
    ids = np.array([int(concept[0]) for concept in concepts])
    sentences = []
    for i in xrange(len(sents)):
        tmp = {'sent_id': i+1, 'entities': [], 'relations': []}
        # Wanted concepts according to sentence
        wanted = np.where(ids == i)[0].tolist()
        for w_ind in wanted:
            w_conc = concepts[w_ind]
            if hasattr(w_conc, 'cui'):
                tmp_conc = {'label': w_conc.preferred_name, 'cui': w_conc.cui, 
                            'sem_types': w_conc.semtypes, 'score': w_conc.score}
                tmp['entities'].append(tmp_conc)
        sentences.append(tmp)
    if errors:
        time_log('Errors with extracting concepts!')
        time_log(errors)
    return {'sents': sentences, 'sent_text':text}
Source file: data_extractor.py, project: medknow, author: kbogas
def reverb_wrapper(text, stop=None):
    """
    Function-wrapper for ReVerb binary. Extracts relations
    found in text.
    Input:
        - text: str,
        a piece of text or sentence
        - stop: list,
        list of stopwords to remove from the relations
    Output:
        - total: list,
        list of lists. Each inner list contains one relation in the form
        [subject, predicate, object]
    """
    total = []
    for sent in sent_tokenize(text):
        cmd = 'echo "' + sent + '"' "| ./reverb -q | tr '\t' '\n' | cat -n"
        reverb_dir = settings['load']['path']['reverb']
        result = runProcess(cmd, reverb_dir)
        # Extract relations from reverb output
        result = result[-3:]
        result = [row.split('\t')[1].strip('\n') for row in result]
        # Remove common stopwords from relations
        if stop:
            result = [stopw_removal(res, stop) for res in result]
        total.append(result)
    # Remove empty relations
    total = [t for t in total if t]
    return total
Source file: data_extractor.py, project: medknow, author: kbogas
def extract_entities(text, json_=None):
    """
    Extract entities from a given text using metamap and
    generate a json, preserving info regarding the sentence
    of each entity that was found. For the time being, we preserve
    both concepts and the entities related to them
    Input:
        - text: str,
        a piece of text or sentence
        - json_: dic,
        sometimes the json to be returned is given to us to be enriched
        Defaults to an empty json_
    Output:
        - json_: dic,
        json with fields text, sents, concepts and entities
        containg the final results
    """
    if json_ is None:
        # avoid the mutable-default-argument pitfall
        json_ = {}
    json_['text'] = text
    # Tokenize the text
    sents = sent_tokenize(text)
    json_['sents'] = [{'sent_id': i, 'sent_text': sent} for i, sent in enumerate(sents)]
    json_['concepts'], _ = mmap_extract(text)
    json_['entities'] = {}
    for i, sent in enumerate(json_['sents']):
        ents = metamap_ents(sent)
        json_['entities'][sent['sent_id']] = ents
    return json_

