from string import punctuation as punctuations  # 'punctuations' is used below

def clean_text(self, txt):
    """
    Clean a text according to the settings in the clean config (self.cc).
    """
    txt = str(txt)
    if self.cc['lower']:
        txt = txt.lower()
    if self.cc['punctuation']:
        txt = "".join(x for x in txt if x not in punctuations)
    if self.cc['whitespace']:
        # note: joining on "" removes all whitespace; use " ".join(...) to collapse it instead
        txt = "".join(txt.split())
    if self.cc['digit']:
        txt = "".join(x for x in txt if x not in "0123456789")
    return txt
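A minimal usage sketch, assuming the method lives on a class whose clean config cc is a plain dict (the TextCleaner class and the config values here are hypothetical):

class TextCleaner:
    # hypothetical host class for the method above
    def __init__(self, clean_config):
        self.cc = clean_config
    clean_text = clean_text  # reuse the module-level def above as a method

cleaner = TextCleaner({'lower': True, 'punctuation': True,
                       'whitespace': False, 'digit': True})
print(cleaner.clean_text("Hello, World 42!"))  # -> 'hello world ' (trailing space kept)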
import re
import string

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
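For example, the helpers compose right to left: lowercase, strip punctuation, drop articles, then collapse whitespace:

print(normalize_answer("The quick, Brown Fox!"))  # -> 'quick brown fox'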
from string import punctuation

def list_returns(fileToCheck, desiredInterface):
    """Collect the values of 'return' statements found in a Python source file."""
    # desiredInterface is unused here; kept to preserve the original signature
    returnsList = []
    with open(fileToCheck, 'r') as pyFile:
        for line in pyFile:
            if line.find("#") != -1:  # skip lines containing comments
                continue
            newFront = line.find("return")
            if newFront == -1:
                continue
            possibleErrorMessageCheck1 = line.find("'")
            bracketBefore = line.find("{")
            lastBracket = line.find("}")
            newLine = line[possibleErrorMessageCheck1:]
            possibleErrorMessageCheck2 = newLine.find(" ")
            if possibleErrorMessageCheck2 == -1:
                # bare return value: take what follows "return " (7 chars),
                # split into words and strip surrounding punctuation
                line = line[newFront + 7:]
                returnsList.extend(word.strip(punctuation) for word in line.split())
            elif possibleErrorMessageCheck1 == bracketBefore + 1:
                # quote opens right after '{': keep the whole '{...}' literal as one entry
                returnsList.append(line[newFront + 7:lastBracket + 1])
    return returnsList
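A quick sanity check against a throwaway file (the file name and contents are illustrative):

with open("example.py", "w") as f:
    f.write("def f():\n    return foo\n")
print(list_returns("example.py", None))  # -> ['foo']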
def make_wifipassword(args):
    import random, string, hashlib
    if args.password is None:
        # string.letters / xrange in the original are Python 2 spellings
        printable = string.digits + string.ascii_letters + string.punctuation
        args.password = ''.join(random.choice(printable) for _ in range(32))
    if args.password_id is None:
        args.password_id = random.randint(0x0010, 0xFFFF)
    # first 20 bytes of the SHA-256 digest of the enrollee's public key
    pkhash = hashlib.sha256(args.pubkey.read()).digest()[0:20]
    record = nfc.ndef.WifiPasswordRecord()
    record.password['public-key-hash'] = pkhash
    record.password['password-id'] = args.password_id
    record.password['password'] = args.password
    message = nfc.ndef.Message(record)
    if args.outfile.name == "<stdout>":
        # hex-encode for terminal output; str(message).encode("hex") in the
        # original works on Python 2 only
        args.outfile.write(str(message).encode().hex())
    else:
        args.outfile.write(str(message))
def _insert(self, x, y, text):
    """ Insert text at given x, y coordinates --- used with drag-and-drop. """
    # Clean text: keep only ASCII letters, digits, punctuation and spaces.
    # (The original used Python 2's string.letters and a bare filter(), which
    # returns an iterator rather than a string on Python 3.)
    import string
    allowed = string.ascii_letters + string.digits + string.punctuation + ' '
    text = ''.join(ch for ch in text if ch in allowed)
    # Find insertion point.
    index, flags = self.HitTest((x, y))
    if index == wx.NOT_FOUND:
        if flags & wx.LIST_HITTEST_NOWHERE:
            index = self.GetItemCount()
        else:
            return
    # Get bounding rectangle for the item the user is dropping over.
    rect = self.GetItemRect(index)
    # If the user is dropping into the lower half of the rect, insert _after_ this item.
    if y > rect.y + rect.height / 2:
        index += 1
    self.InsertStringItem(index, text)
def public_posts(self):
    now = datetime.now()
    # fetch posts stored in the DB that have not been published yet (message_id=0)
    posts_from_db = self.db.get_post_without_message_id()
    # keep only RSS items younger than one day
    today_news = [i for i in self.src.news if (
        now - datetime.fromtimestamp(i.date)).days < 1]
    # publish the intersection of fresh RSS items and unpublished DB posts
    for_publishing = list(set(today_news) & set(posts_from_db))
    for_publishing = sorted(for_publishing, key=lambda news: news.date)
    # for_publishing = sorted(today_news, key=lambda news: news.date)
    for post in tqdm(for_publishing, desc="Posting news"):
        # build a hashtag from the post title, stripping punctuation, dashes and quotes
        header = base64.b64decode(post.text).decode('utf8')
        header = ''.join(c for c in header if c not in set(punctuation + '—«»'))
        header = '#' + '_'.join(header.lower().split())
        text = '%s %s' % (header,
                          self.bit_ly.short_link(base64.b64decode(post.link).decode('utf8')))
        a = self.send_message(
            chat_id=self.chat_id, text=text)  # , parse_mode=telegram.ParseMode.HTML)
        message_id = a.message_id
        chat_id = a['chat']['id']
        self.db.update(post.link, chat_id, message_id)
        logging.info(u'Public: %s;%s;' % (str(post), message_id))
        time.sleep(self.delay_between_messages)
import re
from string import punctuation, whitespace

def rem_whitespace(string):
    """Replace runs of punctuation and whitespace with single spaces.

    Careful to keep this order of patterns, or the duplicate whitespace
    created in the first round will not be removed.
    """
    unwanted_chars = punctuation + whitespace
    # re.escape is needed here: punctuation contains ']', '\' and '-', which
    # would otherwise break the character class
    # (the original list also had backslash patterns, unreachable after the first pass)
    pat_l = [r'[' + re.escape(unwanted_chars) + r']',
             r'\s+',
             r' ']
    for p in pat_l:
        string = re.sub(p, ' ', string)
    return string.strip()
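A short demonstration of the two passes, assuming the imports above:

print(rem_whitespace("foo--bar,\t baz!"))  # -> 'foo bar baz'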
def get_tag_translate(self, tag):
    translate_dict = {
        "p": "p",
        "punctuation": "",
        # bold is set with font-weight in CSS, not font-style
        "heading": "span style='font-weight: bold'",
        #"heading": "span style='font-weight: bold; font-size:150%'",
        #"h1": "span style='font-weight: bold; font-size:150%'",
        "boldface": "b",
        "italics": "i",
        "underline": "u",
        "superscript": "sup",
        "subscript": "sub",  # fixed: was mapped to "sup"
        "object": "object",
        "text": "html"}
    if tag in translate_dict:
        return translate_dict[tag]
    else:
        print("unsupported tag: ", tag)
        return tag
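Since this is a method, a throwaway holder class can exercise it (the names here are hypothetical):

class _Holder:
    get_tag_translate = get_tag_translate  # reuse the module-level def above

h = _Holder()
print(h.get_tag_translate("subscript"))  # -> 'sub'
print(h.get_tag_translate("blink"))      # prints the warning, then returns 'blink'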
def add_token(self, token_string, token_pos=None):
    # get lemma string:
    if all(x in string.punctuation for x in token_string):
        token_pos = "PUNCT"
        lemma = token_string
    else:
        try:
            # use the current lemmatizer to assign the token to a lemma:
            lemma = self._lemmatize(token_string, self._pos_translate(token_pos)).lower()
        except Exception:
            lemma = token_string.lower()
    # get word id, and create new word if necessary:
    word_dict = {self.word_lemma: lemma, self.word_label: token_string}
    if token_pos and self.arguments.use_nltk:
        word_dict[self.word_pos] = token_pos
    word_id = self.table(self.word_table).get_or_insert(word_dict, case=True)
    # store new token in corpus table:
    return self.add_token_to_corpus(
        {self.corpus_word_id: word_id,
         self.corpus_sentence: self._sentence_id,
         self.corpus_file_id: self._file_id})
def normalize_answer(self, s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
text_helpers.py (duplicated in word2vec_skipgram.py) — project: TensorFlow-Machine-Learning-Cookbook, author: PacktPublishing
import string

def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]
    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
    # Remove stopwords
    texts = [' '.join(word for word in x.split() if word not in stops) for x in texts]
    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]
    return texts
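A short run with a tiny illustrative stopword list:

stops = ['the', 'is', 'a']
print(normalize_text(["The Cat, is on a mat!!"], stops))  # -> ['cat on mat']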
import string

def strip_punctuation(text):
    """
    strips the punctuation from a bunch of text
    """
    # build a translation table for str.translate that replaces all
    # punctuation with spaces -- split() will then remove the extra spaces
    punctuation = string.punctuation
    punctuation = punctuation.replace("'", "")  # keep apostrophes
    punctuation = punctuation.replace("-", "")  # keep hyphenated words
    # building the translation table
    table = {}
    for c in punctuation:
        table[ord(c)] = ' '
    # remove punctuation with the translation table
    text = text.translate(table)
    # remove "--" -- can't do multiple characters with translate
    text = text.replace("--", " ")
    return text
def make_words(text):
    """
    make a list of words from a large bunch of text

    strips all the punctuation and other stuff from a string
    """
    text = strip_punctuation(text)
    # lower-case everything to remove that complication:
    text = text.lower()
    # split into words
    words = text.split()
    # remove the bare single quotes: "'" is both a quote and an apostrophe,
    # and capitalize "i"
    words2 = []
    for word in words:
        if word != "'":  # remove quote by itself
            # "i" by itself should be capitalized
            words2.append("I" if word == 'i' else word)
    # could be done with a list comprehension too -- next week!
    # words2 = [("I" if word == 'i' else word) for word in words if word != "'"]
    return words2
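End to end, the two helpers turn raw prose into normalized word tokens:

print(make_words("Don't stop -- I haven't finished!"))
# -> ["don't", 'stop', 'I', "haven't", 'finished']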
def _get_base_doge_words(self, eng_text):
    """
    Get all base words from text to make doge phrases from.

    eg. 'Hello there, I am happy' -> ['hello', 'be', 'happy']
    ('am' lemmatizes to the infinitive 'be'; order may vary since a set is used)

    Args:
        eng_text (str): Text to get words from.
    Returns:
        list[str]: List of lower case words to use from text.
    """
    phrase_no_punct = "".join(ch for ch in eng_text if ch not in string.punctuation)
    tagged_words = nltk.pos_tag([w.lower() for w in phrase_no_punct.split(' ') if w.isalpha()])
    chosen_words = []
    for word, tag in tagged_words:
        # keep nouns, verbs and adjectives only
        if tag[0] in ['N', 'V', 'J']:
            # make noun singular
            if tag[0] == 'N':
                word = self._lemmatizer.lemmatize(word, pos='n')
            # make verb infinitive
            elif tag[0] == 'V':
                word = self._lemmatizer.lemmatize(word, pos='v')
            # strip non-ASCII characters (lemmatize returns unicode on Python 2)
            chosen_words.append(word.encode('ascii', 'ignore').decode('ascii'))
    return list(set(chosen_words))
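A sketch of how the method might be exercised, assuming the NLTK tagger and WordNet data are downloaded (the holder class is hypothetical, and the exact output depends on the tagger):

import nltk, string
from nltk.stem import WordNetLemmatizer

class _DogeHolder:
    _lemmatizer = WordNetLemmatizer()
    _get_base_doge_words = _get_base_doge_words  # reuse the def above

print(_DogeHolder()._get_base_doge_words("Hello there, I am happy"))
# e.g. ['hello', 'be', 'happy'] -- order varies because a set is used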
import string

def str2index(str_):
    # clean white space
    str_ = ' '.join(str_.split())
    # remove punctuation and make lower case
    # (str_.translate(None, ...) in the original is Python 2; use str.maketrans on Python 3)
    str_ = str_.translate(str.maketrans('', '', string.punctuation)).lower()
    res = []
    for ch in str_:
        try:
            # byte2index is defined elsewhere in the module (character -> vocabulary index)
            res.append(byte2index[ch])
        except KeyError:
            # drop OOV
            pass
    return res
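A minimal demonstration with a hypothetical byte2index table (in the original module this maps characters to vocabulary indices):

byte2index = {ch: i for i, ch in enumerate('abcdefghijklmnopqrstuvwxyz ', 1)}
print(str2index("Hi,  there!"))  # punctuation stripped, OOV dropped -> [8, 9, 27, 20, 8, 5, 18, 5]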