def copy_packages(self):
    # Copy each missing, zipped NLTK package from the bundled _NLTK_dir into
    # the first directory on NLTK's data path, then unpack it in place.
    import nltk.data
    target_path = nltk.data.path[0]
    for x in [comp for comp in self._missing if "/" in comp]:
        parts = x.split("/")
        subdir = os.path.join(target_path, parts[0])
        package = parts[1]
        zip_name = "{}.zip".format(package)
        self.updateLabel.emit(package)
        src = os.path.join(_NLTK_dir, zip_name)
        dst = os.path.join(subdir, zip_name)
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        if os.path.exists(src):
            shutil.copyfile(src, dst)
        else:
            raise ValueError("Package file {}.zip not found in {}".format(package, _NLTK_dir))
        with zipfile.ZipFile(dst) as zipped:
            for member in zipped.infolist():
                zipped.extract(member, subdir)
        self.progressTheBar.emit()
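A minimal companion sketch, assuming NLTK is installed: it shows how NLTK resolves its data directories, which is where copy_packages() above unpacks the missing packages. The resource name 'tokenizers/punkt' is only an example.

import nltk.data

# The directories NLTK searches for corpora, tokenizers and other packages;
# copy_packages() writes into the first entry.
print(nltk.data.path)

# nltk.data.find() resolves a resource inside those directories and raises
# LookupError when the package has not been installed yet.
try:
    print(nltk.data.find('tokenizers/punkt'))
except LookupError:
    print("punkt is not installed")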
Example source code for Python's nltk.data module.
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # ceil(data_size / batch_size); subtracting 1 first avoids yielding an
    # empty final batch when data_size is an exact multiple of batch_size.
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
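A short usage sketch, assuming numpy is imported as np (the snippet already requires it): it iterates over a toy dataset of ten (x, y) pairs in batches of four for two epochs.

import numpy as np

toy_data = list(zip(np.arange(10), np.arange(10) * 2))  # ten (x, y) pairs
for batch in batch_iter(toy_data, batch_size=4, num_epochs=2, shuffle=True):
    print(batch.shape)  # (4, 2) for full batches, (2, 2) for the final one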
def read_json_file(path_to_json):
    objects = []
    data = ''
    with io.open(path_to_json, 'r', encoding='utf8') as f:
        for line in f:
            # A blank line marks the end of one pretty-printed JSON object.
            if line in ('\n', '\r\n'):
                objects.append(json.loads(data))
                data = ''
            else:
                data += line
    try:
        # Parse whatever is left after the last blank line.
        objects.append(json.loads(data))
    except ValueError:
        return objects
    return objects
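A hedged usage sketch with a hypothetical file name; the expected input is a file of pretty-printed JSON objects separated by blank lines, as the blank-line check above implies.

records = read_json_file('sentences.json')  # hypothetical path
print(len(records), "JSON objects loaded")
print(type(records[0]))                     # dict, one per blank-line-separated block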
# get original sentence, compression sentence
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining arguments
    are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags or self.TAGS
def docs(self, fileids=None, categories=None):
    """
    Returns the complete JSON document for every file in the corpus.
    Note that I attempted to use the nltk ``CorpusView`` and ``concat``
    methods here, but was not getting memory safe iteration. Instead the
    simple Python generator by far did a better job of ensuring that file
    handles got closed and that not all data was loaded into memory at a
    time. In the future, I will try to re-implement the corpus view.
    """
    # Resolve the fileids and the categories
    fileids = self._resolve(fileids, categories)

    # Create a generator, loading one document into memory at a time.
    for path, enc, fileid in self.abspaths(fileids, True, True):
        with codecs.open(path, 'r', encoding=enc) as f:
            yield json.load(f)
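A hypothetical usage sketch: the class name JSONCorpusReader and the directory layout are assumptions (the listing only shows an __init__ and docs() that combine CategorizedCorpusReader and CorpusReader), and the category name is a placeholder for whatever CAT_PATTERN extracts from the file paths.

corpus = JSONCorpusReader('data/corpus')      # hypothetical class and corpus root
for doc in corpus.docs(categories=['news']):  # 'news' is a placeholder category
    print(type(doc))                          # each item is one parsed JSON document
    break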
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
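A usage sketch against the small city database used by NLTK's chat80 module; it assumes that data package has been downloaded and uncompressed (e.g. via nltk.download('city_database')) and that the table and column names follow the chat80 schema.

rows = sql_query('corpora/city_database/city.db',
                 "SELECT City, Country FROM city_table WHERE Country = 'china'")
for city, country in rows:
    print(city, country)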
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    #book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    #book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)), np.ones(len(book))]).T
    sentences, labels = x_text, y
    sentences_padded = pad_sentences(sentences)
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
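A hypothetical call; the path is a placeholder, and clean_str, pad_sentences, build_vocab and build_input_data must already be defined in the same module (they are not shown in this listing).

x, y, vocabulary, vocabulary_inv, sentencesT = load_data_for_books("./data/books/sample_book.txt")
print(len(x), len(y), len(vocabulary))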
def add_full_stops_to_the_end(infile, outfile):
    # Clean data of short titles and add full stops so that NLTK sentence
    # tokenization works on the headlines.
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # ignore headlines with three words or fewer
            elif len(line.split()) <= 3:
                pass
            elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('\'\n') or line.endswith('"\n'):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),
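A minimal call with placeholder file names (Python 2, like the snippet): the input holds one headline per line, and the output is the same text with a trailing full stop appended to headlines that lack end punctuation.

add_full_stops_to_the_end('headlines_raw.txt', 'headlines_with_stops.txt')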
############################################
# Convert All except first word and quotes
# to lower case #
############################################
def location(url):
    fdata = {'Accept': '*/*',
             'Accept-Encoding': 'gzip, deflate',
             'Accept-Language': 'en-US,en;q=0.8',
             'Connection': 'keep-alive',
             'Content-Length': '29',
             'Content-type': 'application/x-www-form-urlencoded',
             'Cookie': 'PHPSESSID=hisbu0rrh09nssn99vckkqr740; __utma=103585558.1324897437.1443987736.1443987736.1443987736.1; __utmb=103585558.2.10.1443987736; __utmc=103585558; __utmz=103585558.1443987736.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
             'Host': 'get-site-ip.com',
             'Origin': 'http://get-site-ip.com',
             'Referer': 'http://get-site-ip.com/',
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    response = requests.post('http://get-site-ip.com/_pages/_moduler/ajaxSkrivUtIpPaNamn.php',
                             data={'dnsNakeLookUp_In': url})
    #print response.content
    soup = BeautifulSoup(response.content, "lxml")
    #print "Location : "
    for i in soup.find_all("div", {"class": "response"}):
        # print i.get_text()
        # print i.get_text().split('-')[2].replace(' ','')
        return i.get_text().split('-')[2].replace(' ', '')
# Counts how many HTML elements tidy reports as out of place
def nofoutofplacefeatures(url):
    # pdb.set_trace()
    if url[:4] == "http":
        r = requests.get(url)
    else:
        url = "http://" + url
        r = requests.get(url)
    #r = requests.get(url)
    data = r.text
    data2 = r.content
    document, errors = tidy_document(data,
                                     options={'numeric-entities': 1})
    #print document
    #print errors
    #print "Number of Elements Out of Place : " + str(len(errors))
    return len(errors)
def reg_date(url):
    url = url.strip("www.")
    print url
    ur = "http://www.whois.com/whois/" + url
    r = requests.get(ur)
    data = r.content.decode("utf-8")
    #print data
    try:
        soup = BeautifulSoup(data)
        #<div class="whois_result">
        for link in soup.find_all("div", {"class": "whois_result"}):
            site = link.get_text().lower()
            print site.decode("utf-8")
            print "\n date is \n"
            print re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
    except:
        pass
def nofoutofplacefeatures(url):
    try:
        # pdb.set_trace()
        if url[:4] == "http":
            r = requests.get(url)
        else:
            url = "http://" + url
            r = requests.get(url)
        #r = requests.get(url)
        data = r.text
        data2 = r.content
        document, errors = tidy_document(data,
                                         options={'numeric-entities': 1})
        #print document
        #print errors
        #print "Number of Elements Out of Place : " + str(len(errors))
        return len(errors)
    except:
        pass
def reg_date(url):
    url = url.strip("www.")
    #print url
    ur = "http://www.whois.com/whois/" + url
    r = requests.get(ur)
    data = r.content.decode("utf-8")
    #print data
    try:
        soup = BeautifulSoup(data, "lxml")
        #<div class="whois_result">
        for link in soup.find_all("div", {"class": "whois_result"}):
            site = link.get_text().lower()
            #print site.decode("utf-8")
            print "\n Domain registration date is " + re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
    except:
        pass
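A rough usage sketch combining the three feature helpers above on one hostname; it assumes requests, BeautifulSoup, re and tidy_document are imported at module level, and the results depend on third-party services (get-site-ip.com, whois.com) that may have changed since this code was written.

url = "example.com"  # placeholder hostname
features = {
    'tidy_errors': nofoutofplacefeatures(url),  # count of out-of-place HTML elements
    'server_location': location(url),           # location string scraped from get-site-ip.com
    'registration_date': reg_date(url),         # e.g. '14-aug-1995' scraped from whois.com
}
print(features)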
def read_data(source):
    """
    Reads the sentence data from the csv file, which is of the form (sentence, is_summary_sentence).

    Args:
        source = the data file to read the data from

    Returns:
        A list of tuples where each tuple is of the form (sentence, is_summary_sentence).
    """
    sentences = []
    count = 0
    with open(source, "r") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            sentence = row[0]
            sentence = sentence.strip("\"")
            sentence = sentence.strip("[")
            sentence = sentence.strip("]")
            sentence = sentence.replace("'", "")
            sentence = sentence.replace(" ", "")
            sentence = sentence.split(",")
            sentences.append(sentence)
            count += 1
    return sentences
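A usage sketch with a hypothetical CSV layout: column 0 of each row holds a stringified word list such as "['the', 'cat', 'sat']", which read_data() strips down to a plain list of words.

papers = read_data('summary_sentences.csv')  # placeholder filename
print(len(papers), "rows read")
print(papers[0])                             # e.g. ['the', 'cat', 'sat']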
# ============================================
# ================ MAIN PROGRAM ==============
# Read in all of the papers into a list of lists. Each item in the list is a sentence, in the form of a list of words.
smell_datamine_multiprocessing.py source (project: Smelly-London, author: Smelly-London)
def tokenize_to_sentence(text):
    parser = nltk.data.load('tokenizers/punkt/english.pickle')

    # split into sentences
    sentences = parser.tokenize(text.strip())
    return [lemmatize_sentence(sentence) for sentence in sentences]
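A small usage sketch; lemmatize_sentence() is assumed to be defined elsewhere in the same module, so each item in the result is whatever that helper returns for one Punkt-detected sentence.

report_text = "The smell was noted in June. It had gone by July."
for processed in tokenize_to_sentence(report_text):
    print(processed)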
def getMeta(self, fileName):
    """Return the meta data for a given fileName, e.g. year, url, MOH, borough, bID."""
    splitReport = fileName.split('.')
    bID = splitReport[2]
    year = splitReport[1]
    url = self.getUrl(bID)
    try:
        region = mapping[bID][1]
        mohRegion = mapping[bID][0]
    except:
        # TODO there is a problem with mappings, e.g. Acton.1915.b19783905.txt. Region cannot be found
        print(fileName)
        return (None, None, None, None, None)
    return year, region, bID, url, mohRegion
def get_app_data(app_id):
    url = 'http://store.steampowered.com/api/appdetails?appids=' + str(app_id)
    response = urllib.urlopen(url)
    try:
        data = json.loads(response.read())
        if not data[str(app_id)]['success'] or data[str(app_id)]['data']['type'] != 'game':
            return None
        return data[str(app_id)]
    except:
        return None
def get_apps():
    url = 'http://api.steampowered.com/ISteamApps/GetAppList/v2/'
    response = urllib.urlopen(url)
    try:
        data = json.loads(response.read())
        apps = data['applist']['apps']
        return apps
    except:
        return None
def get_description_from_app_data(app_data):
    description = clean_string(app_data['data']['detailed_description'])
    sentences = SENTENCE_DETECTOR.tokenize(description.strip())
    if len(sentences) > 0:
        sentences = sentences[0:(min(3, len(sentences)))]
        sentences = [x for x in sentences if len(x.split(' ')) > 5 and not x.split(' ')[0].isupper() and x.find('\r') == -1]
        combined_sentence = ' '.join(sentences)
        if len(combined_sentence) == 0 or not combined_sentence[0].isalpha() or len(combined_sentence.split(' ')) < 5:
            return None
        return combined_sentence
    return None
def get_title_from_app_data(app_data):
    return clean_string(app_data['data']['name'])
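A hedged end-to-end sketch of the Steam helpers above (Python 2, matching urllib.urlopen): SENTENCE_DETECTOR is assumed to be a Punkt tokenizer loaded via nltk.data.load('tokenizers/punkt/english.pickle') and clean_string() a module-level helper; only the first twenty apps are scanned to keep the example short.

apps = get_apps()
if apps:
    for app in apps[:20]:                     # each entry has 'appid' and 'name'
        app_data = get_app_data(app['appid'])
        if app_data is None:                  # skip non-games and failed lookups
            continue
        title = get_title_from_app_data(app_data)
        description = get_description_from_app_data(app_data)
        if description:
            print(title)
            print(description)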