def get_binary(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False
)))
])
def get_sgdc(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False))
])
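# A minimal usage sketch of the pipeline above; `builder` (an instance of the
# surrounding class), `train_texts`, and `train_labels` are assumed names that do
# not appear in the original source. Note that `n_iter` and `loss='log'` target
# older scikit-learn releases (newer ones use `max_iter` and `loss='log_loss'`).
pipeline = builder.get_sgdc()
pipeline.fit(train_texts, train_labels)  # TF-IDF -> top 10% of features -> SGD
predictions = pipeline.predict(["een korte voorbeeldtekst"])
probabilities = pipeline.predict_proba(["een korte voorbeeldtekst"])  # available because loss='log'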
def wash(fileList):
# denyPos = ['CC', 'CD', 'DT', 'TO', '']
st = LancasterStemmer()
for f in tqdm(fileList):
fr = open('./washFile/' + f, 'r')
fw = open("./washFile_stem/" + f, 'w')
for line in fr.read().splitlines():
line = remove_punctuation(line).lower()
# wordpos = pos(remove_punctuation(line).lower())
# for turple in wordpos:
# if (turple[0] not in stopwords.words('english')):
# fw.write(turple[0] + ' ')
# fw.write(x + ' ' for x in line.split() if x not in stopwords.words('english'))
# stopw = stopwords.words('english')
            words = line.split()
            for x in words:
                try:
                    fw.write(st.stem(x) + ' ')
                except Exception:
                    print(x)
fr.close()
fw.close()
def count_entries(file_list):
    """Performs a count of the number of words in the corpus.
    Args:
        file_list (list): list of file names.
    Returns:
        list: a list of dicts containing the count per file name
    """
    result = []
    for obj in file_list:
        with open(CSV_PATH + obj + '.csv', "r") as entry:
            reader = csv.reader(entry, delimiter=",")
            col_count = len(next(reader))
            res = {"Filename": obj, "Count": col_count}
            result.append(res)
    return result
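# Usage sketch; CSV_PATH and the file names below are assumptions, not part of
# the original source. Count is the number of fields in the first row of each CSV.
counts = count_entries(["reviews_part1", "reviews_part2"])
for item in counts:
    print("%(Filename)s: %(Count)d" % item)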
def words_to_char_sequence(words_list, tk):
    """Convert a list of word lists to a padded character-index tensor.
    # Arguments
        words_list: list of sentences, each a list of words, shape (sentence_len, word_len)
        tk: fitted character-level tokenizer
    # Output shape
        (sentence_len, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)
    """
    c_seqs = np.zeros((len(words_list),
                       TrainConfig.MAX_SEQUENCE_LENGTH,
                       TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
    for w_i in range(len(words_list)):
        words = words_list[w_i]
        fixed_ws = np.zeros((TrainConfig.MAX_SEQUENCE_LENGTH,
                             TrainConfig.MAX_CHAR_PER_WORD), dtype='int32')
        ws = tk.texts_to_sequences(words)
        ws = pad_sequences(ws, maxlen=TrainConfig.MAX_CHAR_PER_WORD)
        max_word_len = min(TrainConfig.MAX_SEQUENCE_LENGTH, len(words))
        fixed_ws[:max_word_len, :] = ws[:max_word_len, :]
        c_seqs[w_i] = fixed_ws
    return c_seqs
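# Usage sketch with a character-level Keras tokenizer; TrainConfig is assumed to be
# the project's configuration object holding MAX_SEQUENCE_LENGTH and MAX_CHAR_PER_WORD.
from keras.preprocessing.text import Tokenizer

tk = Tokenizer(char_level=True)
tk.fit_on_texts(["what is your name", "how old are you"])
sentences = [["what", "is", "your", "name"], ["how", "old", "are", "you"]]
char_tensor = words_to_char_sequence(sentences, tk)
print(char_tensor.shape)  # (2, MAX_SEQUENCE_LENGTH, MAX_CHAR_PER_WORD)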
# featx.py, from the project Natural-Language-Processing-Python-and-NLTK by PacktPublishing
import collections

from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist


def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for label, words in labelled_words:
for word in words:
word_fd[word] += 1
label_word_fd[label][word] += 1
n_xx = label_word_fd.N()
high_info_words = set()
for label in label_word_fd.conditions():
n_xi = label_word_fd[label].N()
word_scores = collections.defaultdict(int)
for word, n_ii in label_word_fd[label].items():
n_ix = word_fd[word]
score = score_fn(n_ii, (n_ix, n_xi), n_xx)
word_scores[word] = score
bestwords = [word for word, score in word_scores.items() if score >= min_score]
high_info_words |= set(bestwords)
return high_info_words
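# Usage sketch with toy labelled data; the labels and word lists are invented.
labelled = [
    ('pos', ['great', 'film', 'great', 'cast', 'loved', 'it']),
    ('neg', ['terrible', 'film', 'boring', 'cast', 'hated', 'it']),
]
informative = high_information_words(labelled, min_score=1)
print(sorted(informative))  # words whose chi-squared score clears the threshold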
# word2vec_skipgram.py, from the project TensorFlow-Machine-Learning-Cookbook by PacktPublishing
import collections


def build_dictionary(sentences, vocabulary_size):
# Turn sentences (list of strings) into lists of words
split_sentences = [s.split() for s in sentences]
words = [x for sublist in split_sentences for x in sublist]
# Initialize list of [word, word_count] for each word, starting with unknown
count = [['RARE', -1]]
# Now add most frequent words, limited to the N-most frequent (N=vocabulary size)
count.extend(collections.Counter(words).most_common(vocabulary_size-1))
# Now create the dictionary
word_dict = {}
    # For each word that we want in the dictionary, add it and assign it an
    # index equal to the current dictionary length
for word, word_count in count:
word_dict[word] = len(word_dict)
return(word_dict)
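# Usage sketch; the sentences are invented. The most frequent words get the lowest
# indices, and index 0 is reserved for the 'RARE' bucket.
sample_sentences = ["the cat sat on the mat", "the dog sat on the log"]
word_dict = build_dictionary(sample_sentences, vocabulary_size=5)
print(word_dict)  # e.g. {'RARE': 0, 'the': 1, 'sat': 2, 'on': 3, ...}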
# Turn text data into lists of integers from dictionary
def load_text_vec(alphabet, filename="", embedding_size=100):
    vectors = {}
    with open(filename) as f:
        i = 0
        for line in f:
            i += 1
            if i % 100000 == 0:
                print('epoch %d' % i)
            items = line.strip().split(' ')
            if len(items) == 2:
                # header line of a textual word2vec file: "<vocab_size> <dim>"
                vocab_size, embedding_size = items[0], items[1]
                print(vocab_size, embedding_size)
            else:
                word = items[0]
                if word in alphabet:
                    vectors[word] = items[1:]
    print('embedding_size', embedding_size)
    print('done')
    print('words found in word2vec embedding', len(vectors))
    return vectors
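# Usage sketch; the embedding file path and vocabulary are assumptions. The file is
# expected in textual word2vec format: an optional "vocab_size dim" header line,
# then one word followed by its vector components per line.
vocab = {'the', 'cat', 'dog'}
vectors = load_text_vec(vocab, filename='embeddings/word2vec.txt')
print(len(vectors), 'vectors loaded for the given vocabulary')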
def add_list_of_words_in_w2v_model(self, unknown_words):
huge_w2v_model_file = open(self.w2v_huge_model_path, "r")
current_w2v_model_file = open(self.w2v_model_path, "a")
line = huge_w2v_model_file.readline()
unknown_words_left = len(unknown_words)
while line and unknown_words_left:
word = line.split()[0]
if word in unknown_words:
current_w2v_model_file.write(line)
unknown_words = unknown_words - set([word])
unknown_words_left -= 1
line = huge_w2v_model_file.readline()
for word in list(unknown_words):
random_position = random(self.w2v_model.vector_size)*2-1
        current_w2v_model_file.write(" ".join([word] + [str(x) for x in random_position]) + "\n")
    print("warning: random positions introduced for new words ... in the future this should be solved")
current_w2v_model_file.close()
huge_w2v_model_file.close()
def extract_NPs(chunk):
"""
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    we extract the NPs with stop words and location words filtered out, and return a list of noun phrases.
"""
forbid_wds = stop_words + location_words
NPs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
if wd not in forbid_wds:
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NPs += [' '.join(filtered_wds)]
return NPs
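# Usage sketch; stop_words and location_words are module-level lists in the original
# project, so tiny stand-ins are assumed here for illustration only.
stop_words = ['the', 'a', 'with']
location_words = ['left', 'right']
chunk = [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')]
print(extract_NPs(chunk))  # ['lady', 'blue shirt']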
def extract_NNs(chunk, pos):
"""
    Given chunk [(phrase, phrase_type)], e.g., [('the lady', 'NP'), ('with', 'PP'), ('the blue shirt', 'NP')],
    and pos [(word, pos)], e.g., [('man', 'NN')],
    we extract from the NPs with stop, location, color, and size words filtered out,
    and return a list of NN words only.
"""
forbid_wds = stop_words + location_words + color_words + size_words
NNs = []
for phrase, ptype in chunk:
if ptype == 'NP':
filtered_wds = []
for wd in phrase.split():
wd_pos = [p[1] for p in pos if p[0] == wd][0]
                if wd not in forbid_wds and wd_pos != 'JJ' and wd_pos != 'CD':  # also skip adjectives (JJ) and cardinal numbers (CD)
filtered_wds += [wd]
if len(' '.join(filtered_wds)) > 0:
NNs += [' '.join(filtered_wds)]
return NNs
def process_text(self, text):
flags = (UNICODE if sys.version < '3' and type(text) is unicode
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
words = findall(regexp, text, flags)
    # remove stopwords
    stopwords = set(i.lower() for i in self.stopwords)
    words = [word for word in words if word.lower() not in stopwords]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
# remove numbers
words = [word for word in words if not word.isdigit()]
if self.collocations:
word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
else:
word_counts, _ = process_tokens(words, self.normalize_plurals)
return word_counts
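# Usage sketch: process_text is a method of wordcloud's WordCloud class, so an
# instance is needed; the sample text is invented.
from wordcloud import WordCloud

wc = WordCloud(collocations=False)
counts = wc.process_text("the cat's hat and the dog's 2 hats")
print(counts)  # a dict mapping each remaining word to its count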
import re

import nltk as nl
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


def tokenize(text):
"""
Tokenizes sequences of text and stems the tokens.
:param text: String to tokenize
:return: List with stemmed tokens
"""
tokens = nl.WhitespaceTokenizer().tokenize(text)
tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
tokens = [word for word in tokens if word not in stopwords.words('english')]
tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
stems = []
stemmer = SnowballStemmer("english")
for token in tokens:
token = stemmer.stem(token)
if token != "":
stems.append(token)
return stems
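# Usage sketch; requires the NLTK stopword corpus (nltk.download('stopwords')).
sample = "the striped bats were hanging on their feet"
print(tokenize(sample))  # stemmed, de-duplicated content words; order varies because of the set operations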
# KaggleWord2VecUtility.py, from the project word2vec_experiments_kaggle_popcorn by bigsnarfdude
def review_to_wordlist( review, remove_stopwords=False ):
# Function to convert a document to a sequence of words,
# optionally removing stop words. Returns a list of words.
#
# 1. Remove HTML
    review_text = BeautifulSoup(review, "html.parser").get_text()
#
# 2. Remove non-letters
review_text = re.sub("[^a-zA-Z]"," ", review_text)
#
# 3. Convert words to lower case and split them
words = review_text.lower().split()
#
# 4. Optionally remove stop words (false by default)
if remove_stopwords:
stops = set(stopwords.words("english"))
words = [w for w in words if not w in stops]
#
# 5. Return a list of words
return(words)
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
# Function to split a review into parsed sentences. Returns a
# list of sentences, where each sentence is a list of words
#
# 1. Use the NLTK tokenizer to split the paragraph into sentences
raw_sentences = tokenizer.tokenize(review.strip())
#
# 2. Loop over each sentence
sentences = []
for raw_sentence in raw_sentences:
# If a sentence is empty, skip it
if len(raw_sentence) > 0:
# Otherwise, call review_to_wordlist to get a list of words
sentences.append( KaggleWord2VecUtility.review_to_wordlist( raw_sentence, \
remove_stopwords ))
#
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
return sentences
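# Usage sketch; assumes the KaggleWord2VecUtility class these methods belong to,
# plus NLTK's pre-trained punkt sentence tokenizer (nltk.download('punkt')).
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
review = "The movie was great. I would watch it again!"
sentences = KaggleWord2VecUtility.review_to_sentences(review, tokenizer, remove_stopwords=True)
print(sentences)  # a list of sentences, each reduced to its lowercased non-stopword tokens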
def extract_unigram_feats(document, unigrams, handle_negation=False):
"""
Populate a dictionary of unigram features, reflecting the presence/absence in
the document of each of the tokens in `unigrams`.
:param document: a list of words/tokens.
:param unigrams: a list of words/tokens whose presence/absence has to be
checked in `document`.
:param handle_negation: if `handle_negation == True` apply `mark_negation`
method to `document` before checking for unigram presence/absence.
:return: a dictionary of unigram features {unigram : boolean}.
>>> words = ['ice', 'police', 'riot']
>>> document = 'ice is melting due to global warming'.split()
>>> sorted(extract_unigram_feats(document, words).items())
[('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
"""
features = {}
if handle_negation:
document = mark_negation(document)
for word in unigrams:
features['contains({0})'.format(word)] = word in set(document)
return features
def __init__(self,
w=20,
k=10,
similarity_method=BLOCK_COMPARISON,
stopwords=None,
smoothing_method=DEFAULT_SMOOTHING,
smoothing_width=2,
smoothing_rounds=1,
cutoff_policy=HC,
demo_mode=False):
if stopwords is None:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
self.__dict__.update(locals())
del self.__dict__['self']
def from_words(cls, words, window_size=2):
"""Construct a BigramCollocationFinder for all bigrams in the given
sequence. When window_size > 2, count non-contiguous bigrams, in the
style of Church and Hanks's (1990) association ratio.
"""
wfd = FreqDist()
bfd = FreqDist()
if window_size < 2:
raise ValueError("Specify window_size at least 2")
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
wfd[w1] += 1
for w2 in window[1:]:
if w2 is not None:
bfd[(w1, w2)] += 1
return cls(wfd, bfd, window_size=window_size)
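# Usage sketch with NLTK's public collocation API; the sample text is invented.
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

tokens = "people who like machine learning like deep learning too".split()
finder = BigramCollocationFinder.from_words(tokens, window_size=2)
print(finder.nbest(BigramAssocMeasures.pmi, 3))  # top 3 bigrams by PMI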
def from_words(cls, words, window_size=3):
"""Construct a TrigramCollocationFinder for all trigrams in the given
sequence.
"""
if window_size < 3:
raise ValueError("Specify window_size at least 3")
wfd = FreqDist()
wildfd = FreqDist()
bfd = FreqDist()
tfd = FreqDist()
for window in ngrams(words, window_size, pad_right=True):
w1 = window[0]
if w1 is None:
continue
for w2, w3 in _itertools.combinations(window[1:], 2):
wfd[w1] += 1
if w2 is None:
continue
bfd[(w1, w2)] += 1
if w3 is None:
continue
wildfd[(w1, w3)] += 1
tfd[(w1, w2, w3)] += 1
return cls(wfd, bfd, wildfd, tfd)
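# Usage sketch, parallel to the bigram case above; the sample text is invented.
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures

tokens = "new york city is larger than new york state".split()
tri_finder = TrigramCollocationFinder.from_words(tokens)
print(tri_finder.nbest(TrigramAssocMeasures.likelihood_ratio, 2))  # top 2 trigrams by log-likelihood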