def WordBreak():
    # Segment the corpus with jieba; relies on module-level `path`, `data`, `logger` and `import jieba`.
    logger.info("running WordBreak in " + path + data)
    inputfile = path + data + ".zhs"
    outputfile = path + data + ".wordbreak"
    i = 0
    with open(inputfile, 'r') as infile, open(outputfile, 'w') as output:
        for line in infile:
            seg_list = jieba.cut(line)
            output.write(u' '.join(seg_list))
            i = i + 1
            if i % 10000 == 0:
                logger.info("Cut " + str(i) + " articles")
    logger.info("Finished. Saved " + str(i) + " articles in " + outputfile)
Python cut() example source code

Source: process_corpus.py (project: question-classification-cnn-rnn-attention, author: sefira)
def sentenceToIndex(sentence, word2idx, maxLen):
    """
    Convert a sentence into a list of word indices for embedding lookup.
    :param sentence: the input sentence
    :param word2idx: mapping from word to index
    :param maxLen: maximum sentence length; the result is padded or truncated to this length
    :return: list of word indices of length maxLen
    """
    unknown = word2idx.get("UNKNOWN", 0)
    num = word2idx.get("NUM", len(word2idx))
    index = [unknown] * maxLen
    i = 0
    for word in jieba.cut(sentence):
        if word in word2idx:
            index[i] = word2idx[word]
        else:
            if re.match(r"\d+", word):
                index[i] = num
            else:
                index[i] = unknown
        if i >= maxLen - 1:
            break
        i += 1
    return index
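A minimal usage sketch, assuming `jieba` and `re` are importable and using a small hypothetical word2idx vocabulary; in practice word2idx is built from the training corpus.

import re
import jieba

# hypothetical toy vocabulary; "UNKNOWN" and "NUM" are the special entries the function expects
word2idx = {"UNKNOWN": 0, "NUM": 1, u"你好": 2, u"世界": 3}

# out-of-vocabulary words map to UNKNOWN, digit tokens map to NUM, the rest stays padding
print(sentenceToIndex(u"你好世界 2024", word2idx, maxLen=5))
# e.g. [2, 3, 0, 1, 0], depending on how jieba segments the input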
def bm25(p, titles, answers, scores):
    original_titles = copy.deepcopy(titles)
    titles = [remove_punctuation_re(title) for title in titles]
    answers = [remove_punctuation_re(answer) for answer in answers]
    p = remove_punctuation_re(p)
    titles = [' '.join(jieba.cut(title)) for title in titles]
    p = ' '.join(jieba.cut(p))
    wordindoc, wordindata, doclen, sumlen = init(titles, False)
    global avglen
    avglen = 1.0 * sumlen / N
    res = search(p, zip(titles, original_titles, answers, scores), wordindoc, wordindata, doclen)
    titles, answers, scores = [], [], []
    for key, _ in res:
        titles.append(key[0])
        answers.append(key[1])
        scores.append(key[2])
    return titles, answers, scores
def get_html_text(url):
    response = requests.get(url)
    origin_text = response.text
    origin_text = re.sub(r'<script.*?>.*?</script>', '', origin_text, flags=re.I | re.M | re.DOTALL)
    origin_text = re.sub(r'<style.*?>.*?</style>', '', origin_text, flags=re.I | re.M | re.DOTALL)
    doc = html.fromstring(origin_text)
    text = doc.xpath('//body//text()')
    text = [i.strip() for i in text if i.strip()]
    text = ' '.join(text)
    seg = jieba.cut(text)
    stopwords = read_stopwords('./utils/stopwords.txt')  # callable read_stopwords()
    seg = [i.strip() for i in seg if i.strip() and not i.strip().isdigit()
           and i.strip() not in stopwords]
    seg = ' '.join(seg)
    return seg
def overlap_index(question, answer, q_len, a_len, stopwords=[]):
    qset = set(cut(question))
    aset = set(cut(answer))
    q_index = np.zeros(q_len)
    a_index = np.zeros(a_len)
    overlap = qset.intersection(aset)
    for i, q in enumerate(cut(question)[:q_len]):
        value = 1
        if q in overlap:
            value = 2
        q_index[i] = value
    for i, a in enumerate(cut(answer)[:a_len]):
        value = 1
        if a in overlap:
            value = 2
        a_index[i] = value
    return q_index, a_index
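A small usage sketch. The slicing inside overlap_index requires `cut` to return a list, so it is assumed here to be a thin wrapper over `jieba.lcut`; `numpy` must also be imported as `np`.

import numpy as np
import jieba

def cut(text):
    # assumed list-returning wrapper, since overlap_index slices cut(...)
    return jieba.lcut(text)

q_idx, a_idx = overlap_index(u"今天天气怎么样", u"今天天气很好", q_len=8, a_len=8)
print(q_idx)  # words shared with the answer are marked 2, other words 1, padding stays 0
print(a_idx)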
def ma_overlap_zi(row):
    question = cut(row["question"])
    answer = cut(row["answer"])
    di_question = []
    di_answer = []
    for w in question:
        for i in range(len(w)):
            di_question.append(w[i])
    for w in answer:
        for i in range(len(w)):
            di_answer.append(w[i])
    di_overlap = set(di_question).intersection(set(di_answer))
    di_weight_p = dict({})
    for k in range(len(di_question)):
        if di_question[k] in di_overlap:
            # print int(100 * ((k + 1) / (len(question) + 1)))
            # alternative weighting: zi_weight[int(100 * ((k + 1) / (len(di_question) + 1)))]
            di_weight_p[di_question[k]] = ((k + 1) / len(di_question)) ** 3.2
    di_weight_all = 0.0
    for k in di_overlap:
        di_weight_all += di_weight_p[k]
    return di_weight_all / (len(di_answer) + 40)
def get_word_count(filename):
    data_source = open(filename, 'r')
    data = data_source.read()
    if data != '':
        temp_result = jieba.cut(data, cut_all=True)
        temp_result = '/'.join(temp_result)
        word_result = temp_result.split('/')
        word_view = {}  # word_view[i] marks whether word i has already been counted for this document
        for i in word_result:
            word_view[i] = 0
            if i not in word_doc:  # word_doc is the module-level document-frequency counter
                word_doc[i] = 0
        for i in word_result:
            if word_view[i] == 0:
                word_view[i] = 1
                word_doc[i] = word_doc[i] + 1
    data_source.close()
def print2file(f, title, responses, marker='', separater=True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
    for word in title_cutted:
        f.write(word + ' ')
    f.write('\n')
    for response in responses:
        # print(response['Content'])
        # if response['Content'] not in count_response.keys():
        #     count_response[response['Content']] = 0
        # count_response[response['Content']] += 1
        if marker != '':
            f.write(marker + ' ')
        response_cutted = jieba.cut(response['Content'].strip(), cut_all=False)
        for word in response_cutted:
            f.write(word + ' ')
        f.write('\n')
    if separater:
        f.write('===\n')
Source: reviews_preprocessing.py (project: Stock-SentimentAnalysis, author: JoshuaMichaelKing)
def word_tokenization(tick_blog_list):
    '''
    Word tokenization with jieba.
    Returns a list of token lists: [[w1, w2, ...], [w1, ...], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            count += 1
            if blog != '':
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
            else:
                print('Line %d is empty!' % count)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list
#-------------------------------------------------------------------------------
def word_tokenization(tick_blog_list):
    '''
    Word tokenization with jieba.
    Returns a list of token lists: [[w1, w2, ...], [w1, ...], ...]
    '''
    count = 0
    seg_list = []
    try:
        for blog in tick_blog_list:
            if blog != '':
                count += 1
                segments = jieba.cut(blog)
                tmp = []
                for seg in segments:
                    tmp.append(seg)
                seg_list.append(tmp)
    except IOError as e:
        logging.error('IOError %s' % e)
    finally:
        return seg_list
def word_segment(line, stop=False, remain_number=True):
    '''
    Segment one line of text with jieba.
    stop: whether to filter out stop words
    remain_number: whether to keep numeric tokens
    '''
    if STOP_WORDS is None:
        load_stopwords()
    seg_list = jieba.cut(line, HMM=True)
    sl = []
    for word in seg_list:
        word = word.strip()
        if len(word) > 0 and word not in PUNCT:
            if stop:
                if word in STOP_WORDS:
                    word = None
            if word is not None and not remain_number:
                if util_func.atof(word) is not None:
                    word = None
            if word is not None:
                sl.append(word)
    return sl
def cut_for_property(self, text):
    '''
    @summary: segment text together with part-of-speech tags
    ---------
    @param text: text to segment
    ---------
    @result: list of tuples [(text1, property1), ..., (textN, propertyN)]
    '''
    words_list = []
    words = pseg.cut(text)
    for word in words:
        if word.word not in self._stop_words:
            words_list.append((word.word, word.flag))
    return words_list
def get_seg_features(string):
    """
    Segment text with jieba.
    Features are encoded in BIES format:
    0 = single-character word (S), 1 = begin (B), 2 = inside (I), 3 = end (E).
    """
    seg_feature = []
    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    return seg_feature
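A quick usage sketch, assuming jieba is installed:

import jieba

print(get_seg_features(u"我爱北京天安门"))
# jieba usually segments this as 我 / 爱 / 北京 / 天安门,
# giving [0, 0, 1, 3, 1, 2, 3]: two single-character words, a 2-character word, a 3-character word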
def get_all_keywords(file_name):
    word_lists = []  # all words in the file
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()
        for li in Lists:
            cut_list = list(jieba.cut(li))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # deduplicated words
    sort_count = []
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print(u'%d distinct words in total' % length)
    k = 1
    for w in word_lists_set:
        sort_count.append(w + u':' + str(word_lists.count(w)) + u" times\n")
        print(u"%d---" % k + w + u":" + str(word_lists.count(w)) + u" times")
        k += 1
    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
def Delete_stopwords():
    print 'Removing stop words...'
    f_stop = open('emotion_file/stopwords.txt')  # stop-word list
    f_stop_list = []
    for word in f_stop.readlines():
        f_stop_list.append(word.strip())  # strip the newline so membership tests match jieba tokens
    f_stop.close()
    f_text = open("emotion_file/data_zhuguan.txt", "r")  # subjective-sentence corpus
    f_nostop = codecs.open('emotion_file/data_zhuguan_nostop.txt', 'w', encoding='UTF-8')
    for text in f_text.readlines():  # segment each line and drop stop words
        f_seg_list = list(jieba.cut(text, cut_all=False))  # word segmentation
        for word in f_seg_list:
            if word in f_stop_list:
                print word
            else:
                f_nostop.write(word)
    f_text.close()
    f_nostop.close()
    print "Finished removing stop words..."
    # The polarity data file data_jixing.txt is processed the same way.
Source: data_utils.py (project: LSTM-CRF-For-Named-Entity-Recognition, author: zpppy)
def get_seg_features(string):
    """
    Segment text with jieba.
    Features are encoded in BIES format:
    0 = single-character word (S), 1 = begin (B), 2 = inside (I), 3 = end (E).
    """
    seg_feature = []
    for word in jieba.cut(string):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            # note: extend, not append, so the per-character tags stay flat
            seg_feature.extend(tmp)
    return seg_feature
Source: ContendSplit.py (project: SentimentAnalysis-chinese-master, author: Chenalong)
def jieba_contend_split(contend):
    # punctuation marks that delimit clauses
    punctuation = [u'。', u'/', u'，', u'！', u'？', u' ', u'\'']
    wordSequenceList = []  # result format: [[(id, word), ...], ...]; each sub-list is one clause
    seg_list = jieba.cut(contend)
    segmentedComment = [item for item in seg_list]
    segmentedCommentTuple = list(enumerate(segmentedComment))
    subWordSequenceList = []
    for wordTuple in segmentedCommentTuple:
        if wordTuple[1] in punctuation:
            if subWordSequenceList:
                wordSequenceList.append(subWordSequenceList)
                subWordSequenceList = []
        else:
            subWordSequenceList.append(wordTuple)
    if subWordSequenceList:
        wordSequenceList.append(subWordSequenceList)
    return wordSequenceList
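A brief usage sketch, assuming jieba is installed; the exact clause boundaries depend on how jieba tokenizes the input:

import jieba

clauses = jieba_contend_split(u'东西很好，物流也快！推荐购买。')
for clause in clauses:
    print([word for _, word in clause])
# roughly: ['东西', '很', '好'], then ['物流', '也', '快'], then ['推荐', '购买']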
Source: SentiAnalysis.py (project: SentimentAnalysis-chinese-master, author: Chenalong)
def segByPunc(self):
    # punctuation marks that delimit clauses
    punctuation = [u'。', u'/', u'，', u'！', u'？', u' ', u'\'']
    wordSequenceList = []  # result format: [[(id, word), ...], ...]; each sub-list is one clause
    seg_list = jieba.cut(self.commentSentence)
    segmentedComment = [item for item in seg_list]
    segmentedCommentTuple = list(enumerate(segmentedComment))
    subWordSequenceList = []
    for wordTuple in segmentedCommentTuple:
        if wordTuple[1] in punctuation:
            if subWordSequenceList:
                wordSequenceList.append(subWordSequenceList)
                subWordSequenceList = []
        else:
            subWordSequenceList.append(wordTuple)
    if subWordSequenceList:
        wordSequenceList.append(subWordSequenceList)
    return wordSequenceList
def _asian_tokenization(doc, entity_type, tag_type, tokenizer):
    sents = []
    for paragraph in doc.split('\n'):
        # split on sentence-final punctuation; the delimiters are captured and re-attached below
        sent_splits = iter(re.split(u'(。|！|？|；)+', paragraph, flags=re.MULTILINE))
        for partial_sent in sent_splits:
            sent = partial_sent + next(sent_splits, '')
            if sent.strip() == '':
                continue
            toks = []
            # for tok in jieba.cut(sent):
            for tok in tokenizer(sent):
                pos = 'WORD'
                if tok.strip() == '':
                    pos = 'SPACE'
                elif punct_re.match(tok):
                    pos = 'PUNCT'
                toks.append(Tok(pos,
                                tok[:2].lower(),
                                tok.lower(),
                                tok,
                                ent_type='' if entity_type is None else entity_type.get(tok, ''),
                                tag='' if tag_type is None else tag_type.get(tok, '')))
            sents.append(Sentence(toks, sent))
    return Doc(sents, doc)
def get_result(url_set):
    line_set = []
    for url in url_set:
        wb_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        a = soup.select('span.ctt')
        for i in range(len(a)):
            text = re.sub('<[^>]*>', '', a[i].text)
            text = re.sub('??', ' ', text)
            text = re.sub('[\W]+', ' ', text)
            line_set.append(text)
            # print(text)
            # writer.writerow((i, text))
    word_list = [" ".join(jieba.cut(sentence)) for sentence in line_set]
    new_text = ' '.join(word_list)
    wordcloud = WordCloud(font_path="C:/Python34/Lib/site-packages/wordcloud/simhei.ttf",
                          background_color="black").generate(new_text)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def load_utf8_data_and_labels(positive_data_file, negative_data_file):
    # Load data from files
    positive_data = list(codecs.open(positive_data_file, "r", encoding='utf-8').readlines())
    positive_examples = list()
    for s in positive_data:
        positive_examples.append(" ".join(jieba.cut(s)))
    negative_data = list(codecs.open(negative_data_file, "r", encoding='utf-8').readlines())
    negative_examples = list()
    for s in negative_data:
        negative_examples.append(" ".join(jieba.cut(s)))
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def test(self, input_str):
    '''
    Step 4: segment the input string, build its bag-of-words vector,
    and classify it with the trained SVM model.
    '''
    test_input = input_str
    x_test = np.zeros(self.count + 1)  # bag-of-words feature vector
    after_split = " ".join(jieba.cut(test_input))  # word segmentation
    words = after_split.split(" ")
    for i in words:
        i = i.replace('\n', '')
        i = i.replace('\r', '')
        i = i.replace(' ', '')
        if self.dictionary.__contains__(i.encode('utf-8')):
            x_test[self.dictionary[i.encode('utf-8')]] = 1.
        # else:
        #     print 'Cannot find: ' + i
    # return 1 for the positive class, otherwise 0
    if self.mySVM.predict([x_test]) == 1.:
        return 1
    else:
        return 0
def post_desc_counter():
    """Count word frequencies in the job-post requirement descriptions."""
    # import thulac
    post = open(os.path.join("data", "post_require.txt"),
                "r", encoding="utf-8").read()
    # segmentation with thulac (disabled)
    # thu = thulac.thulac(seg_only=True)
    # thu.cut(post, text=True)
    # segmentation with jieba
    file_path = os.path.join("data", "user_dict.txt")
    jieba.load_userdict(file_path)
    seg_list = jieba.cut(post, cut_all=False)
    counter = dict()
    for seg in seg_list:
        counter[seg] = counter.get(seg, 0) + 1
    counter_sort = sorted(
        counter.items(), key=lambda value: value[1], reverse=True)
    pprint(counter_sort)
    with open(os.path.join("data", "post_pre_desc_counter.csv"),
              "w+", encoding="utf-8") as f:
        f_csv = csv.writer(f)
        f_csv.writerows(counter_sort)
def calculate_similarity(text1, text2):
    raw1 = jieba.cut(text1)
    raw2 = jieba.cut(text2)
    raw1 = Counter(raw1)
    raw2 = Counter(raw2)
    same_words = set(raw1) & set(raw2)
    if (math.sqrt(len(raw1)) * math.sqrt(len(raw2))) != 0:
        dot_product = 0
        mod1 = 0
        mod2 = 0
        for word in same_words:
            dot_product += raw1[word] * raw2[word]
        for word in raw1:
            mod1 += math.pow(raw1[word], 2)
        for word in raw2:
            mod2 += math.pow(raw2[word], 2)
        cos = dot_product / math.sqrt(mod1 * mod2)
    else:
        cos = 0
    return cos
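A short usage sketch, assuming the module-level imports the function relies on (`jieba`, `math`, and `Counter` from `collections`):

import math
import jieba
from collections import Counter

score = calculate_similarity(u"今天天气很好", u"今天天气不错")
print(round(score, 3))  # cosine similarity of the two term-frequency vectors, in [0, 1]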
def extract_tags(sentence, topK=20):
    words = jieba.cut(sentence)
    freq = {}
    for w in words:
        if len(w.strip()) < 2: continue
        if w.lower() in stop_words: continue
        freq[w] = freq.get(w, 0.0) + 1.0
    total = sum(freq.values())
    freq = [(k, v / total) for k, v in freq.iteritems()]
    tf_idf_list = [(v * idf_freq.get(k, median_idf), k) for k, v in freq]
    st_list = sorted(tf_idf_list, reverse=True)
    top_tuples = st_list[:topK]
    tags = [a[1] for a in top_tuples]
    return tags
def cut_Text(content, nomial=False):
    """
    :param content: string to segment
    :param nomial: if True, only noun-like words are kept
    :return: a space-joined string of tokens, e.g. 'a b c d'
    """
    if nomial:
        text = ''
        words = pseg.cut(content)
        for word in words:
            if contain(['n'], word.flag):
                text = text + ' ' + word.word
        return text.strip()
    else:
        text = ''
        words = jieba.cut(content)
        for word in words:
            text = text + ' ' + word
        return text.strip()
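The helper `contain` is not shown in this snippet; below is a minimal hypothetical stand-in (check whether the POS flag starts with one of the given prefixes), followed by a usage call. `pseg` is assumed to be `jieba.posseg`.

import jieba
import jieba.posseg as pseg

def contain(prefixes, flag):
    # hypothetical stand-in for the project's helper: does the POS flag start with any prefix?
    return any(flag.startswith(p) for p in prefixes)

print(cut_Text(u'我来到北京清华大学', nomial=True))   # keeps noun-like tokens, e.g. '北京 清华大学'
print(cut_Text(u'我来到北京清华大学', nomial=False))  # keeps every token, space-joined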
def cut_Dataset(data_set, parrel=False, nomial=False):
    """
    :param data_set: bunch-style dataset with a .data list of documents
    :param parrel: if True, cut the dataset in parallel (not available on Windows)
    :param nomial: if True, only noun-like words are kept
    :return: data_set with .data replaced by the segmented documents
    """
    from tqdm import tqdm
    data_cut = []
    start = time.time()
    print('cutting dataset......')
    if parrel:
        p = ThreadPool(9)
        data_cut = p.map(cut_Text, data_set.data)  # collect the results; nomial is not applied here
        p.close()
        p.join()
    else:
        for doc_content in tqdm(data_set.data):
            data_cut.append(cut_Text(doc_content, nomial))
    end = time.time()
    print('cutting runs %0.2f seconds.' % (end - start))
    data_set.data = data_cut