def get_only_text_washingtonpost_url(url):
    # This function takes a URL as an argument and returns only
    # the raw text of that page.
    # It works specifically for Washington Post articles,
    # because we know the structure of those pages.
    page = urllib.urlopen(url).read().decode('utf8')
    # download the URL
    soup = BeautifulSoup(page)
    # initialize a BeautifulSoup object with the page we downloaded
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way, e.g. <article> stuff </article>;
    # this format is specific to the Washington Post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text
#######################################################################
# TEST
######################################################################
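# A minimal usage sketch for the TEST section. It assumes the Python 2 style
# urllib and BeautifulSoup imports used by the function above, and the URL is a
# hypothetical stand-in for a real Washington Post article page.
if __name__ == '__main__':
    article_url = 'https://www.washingtonpost.com/world/example-article.html'  # hypothetical URL
    title, body = get_only_text_washingtonpost_url(article_url)
    print(title)
    print(body[:300])  # preview the first few hundred characters of extracted text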
# Python download() usage examples (source code)
# File: NewsAutosummarize.py | Project: Python-Scripts-Repo-on-Data-Science | Author: qalhata
def activate(self, *args, **kwargs):
    np.random.seed(1337)  # for reproducibility
    st = datetime.now()
    self._classifierModel = load_model(self.savedModelPath)
    logger.info("{} {}".format(datetime.now() - st, "loaded _classifierModel"))
    st = datetime.now()
    self._tokenizer = self.get_tokenizer()
    logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer"))
    # st = datetime.now()
    # nltk.download()
    # self._tokenizer_nltk = nltk.data.load('tokenizers/punkt/english.pickle')
    # logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer_nltk"))
    logger.info("SuggestionMiningDL plugin is ready to go!")
def prepare_data():
    make_dirs("data/cache")
    make_dirs("data/embedding/char")
    make_dirs("data/embedding/word")
    make_dirs("data/squad")
    make_dirs("data/trained_model")
    make_dirs("checkpoint")
    nltk.download("punkt")
    train_filename = "train-v1.1.json"
    dev_filename = "dev-v1.1.json"
    squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
    train_url = os.path.join(squad_base_url, train_filename)
    dev_url = os.path.join(squad_base_url, dev_filename)
    download_prefix = os.path.join("data", "squad")
    maybe_download(train_url, download_prefix, train_filename)
    maybe_download(dev_url, download_prefix, dev_filename)
    char_embedding_pretrain_url = "https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt"
    char_embedding_filename = "glove_char.840B.300d.txt"
    maybe_download(char_embedding_pretrain_url, "data/embedding/char", char_embedding_filename)
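# prepare_data() above relies on make_dirs() and maybe_download() helpers that
# are not shown in this snippet. Below is a minimal sketch of what such helpers
# could look like, assuming the intent is "create the directory if missing" and
# "skip the download when the target file already exists"; it is not the
# project's actual implementation.
import os
import urllib.request


def make_dirs(path):
    # Create the directory tree if it does not exist yet.
    if not os.path.exists(path):
        os.makedirs(path)


def maybe_download(url, download_dir, filename):
    # Download url into download_dir/filename only if that file is absent.
    target = os.path.join(download_dir, filename)
    if not os.path.exists(target):
        urllib.request.urlretrieve(url, target)
    return target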
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your
    user directory.
    '''
    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))
    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]
    missing = check_packages_exist(extensions)
    for ext_tuple in missing:
        nltk.download(ext_tuple[1])
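# check_packages_exist() is referenced above but not shown. A minimal sketch of
# such a helper, assuming it should return the (category, package) pairs that
# nltk.data.find() cannot locate; the project's real implementation may differ.
import nltk


def check_packages_exist(extensions):
    # Each extension is a (category, package) tuple, e.g. ("tokenizers", "punkt").
    missing = []
    for category, package in extensions:
        try:
            nltk.data.find("{}/{}".format(category, package))
        except LookupError:
            missing.append((category, package))
    return missing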
# File: esa_jobtitle_normalizer.py | Project: skills-ml | Author: workforce-data-initiative
def retrieve_onet_titles(self):
    onet_titles = pd.concat(
        (pd.read_csv(self.onet_downloader.download(
            version,
            'Occupation Data.txt',
            'occupation_data.txt'
        ), sep='\t') for version in ONET_VERSIONS),
        ignore_index=True
    )
    # Assumes pandas 0.19; keeps the newest duplicate Title
    onet_titles.drop_duplicates('Title', inplace=True, keep='last')
    onet_titles['Major'] = onet_titles.iloc[:, 0].apply(lambda x: x[:2])
    LOWER = True
    if LOWER:
        # all RDD strings are unicode
        onet_titles['Title'] = onet_titles['Title'].str.lower()
        onet_titles['Description'] = onet_titles['Description'].str.lower()
    # now we can do a title -> Major, Minor lookup
    onet_titles.set_index('Title', inplace=True)
    # access with onet_titles.loc[u'sales agents, financial services']
    # (titles are lowercased above, so look up with a lowercase string)
    return onet_titles
def __init__(self, opt, embedding_dim):
    """Initialize the class according to given parameters."""
    self.tok2emb = {}
    self.embedding_dim = embedding_dim
    self.opt = copy.deepcopy(opt)
    self.load_items()
    nltk.download('punkt')
    if not self.opt.get('fasttext_model'):
        raise RuntimeError('No pretrained fasttext model provided')
    self.fasttext_model_file = self.opt.get('fasttext_model')
    if not os.path.isfile(self.fasttext_model_file):
        emb_path = os.environ.get('EMBEDDINGS_URL')
        if not emb_path:
            raise RuntimeError('No pretrained fasttext model provided')
        fname = os.path.basename(self.fasttext_model_file)
        try:
            print('Trying to download a pretrained fasttext model from the repository')
            url = urllib.parse.urljoin(emb_path, fname)
            urllib.request.urlretrieve(url, self.fasttext_model_file)
            print('Downloaded a fasttext model')
        except Exception as e:
            raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable is set incorrectly', e)
    self.fasttext_model = fasttext.load_model(self.fasttext_model_file)
def main():
    nltk_deps = ['punkt', 'averaged_perceptron_tagger']
    print('Checking nltk deps...')
    for dep in nltk_deps:
        nltk.download(dep)
    print('nltk deps done')
def __init__(self, tokenizer_type="PTBTokenizer"):
    # Sanity checks
    if tokenizer_type in ['SpaceTokenizer', 'NLTKWhiteSpaceTokenizer', 'PTBTokenizer']:
        self.tokenizer_type = tokenizer_type
    else:
        print("Unrecognized tokenizer type: falling back to the default (PTBTokenizer)")
        self.tokenizer_type = "PTBTokenizer"
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
def __init__(self):
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
def load_nltk_data():
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('snowball_data')
def download_packages(self):
    import nltk
    for x in [comp for comp in self._missing if "/" in comp]:
        package = x.split("/")[1]
        self.updateLabel.emit(package)
        nltk.download(package, raise_on_error=True)
        self.progressTheBar.emit()
def prepare():
    """Prepare tagger for run.
    This should be run after installation to initialize the tagger's resources.
    """
    import nltk
    import requests
    from libarchive import extract_memory
    import os
    from shutil import move
    from f8a_tagger.utils import get_files_dir
    nltk.download("punkt")
    nltk.download("wordnet")
    maven_index_checker_url = 'https://github.com/fabric8-analytics/' \
                              'maven-index-checker/files/1275145/' \
                              'maven-index-checker-v0.1-alpha.zip'
    response = requests.get(maven_index_checker_url)
    if response.ok is not True:
        raise RemoteDependencyMissingError("Failed to download maven-index-checker with "
                                           "response code %s",
                                           response.status_code)
    # Unfortunately there is no way to know the name or path of the extracted file,
    # so assume it's maven-index-checker.jar
    jar_name = "maven-index-checker.jar"
    jar_path = get_files_dir()
    extract_memory(response.content)
    move(jar_name, os.path.join(jar_path, jar_name))
def run(self):
    # setuptools is an oldie but goldie. super() is not supported by the base class
    # (it's an "old-style class").
    SetuptoolsInstallCommand.do_egg_install(self)
    import nltk
    for corpus in _required_nltk_corpora:
        nltk.download(corpus)
def install_nltk_corpora(*packages):
    nltk_packages = list(packages)
    try:
        installed = (set(os.listdir(nltk.data.find("corpora"))) |
                     (set(os.listdir(nltk.data.find("taggers"))))) | \
                    (set(os.listdir(nltk.data.find("tokenizers"))))
    except LookupError:
        installed = set()
    if not set(nltk_packages) <= set(installed):
        nltk.download(nltk_packages)
def run(self):
    # PUT YOUR POST-INSTALL SCRIPT HERE or CALL A FUNCTION
    import nltk
    nltk.download('punkt')
    install.run(self)
def ensure_nltk_packages():
    for package in ('stopwords', 'punkt', 'wordnet'):
        nltk.download(package, quiet=True)
def download_nltk_resource_if_missing(resource_path, resource):
    """
    Download a missing resource from the Natural Language Toolkit (NLTK).

    :param resource_path: Link / path for the NLTK resource.
    :type resource_path: str
    :param resource: Identifier / name of the resource (used to download it if it's not found).
    :type resource: str
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource)
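# A short usage sketch (an assumption, since the original call sites are not
# shown): the first argument is the path nltk.data.find() expects, the second
# is the identifier passed to nltk.download().
download_nltk_resource_if_missing('tokenizers/punkt', 'punkt')
download_nltk_resource_if_missing('corpora/wordnet', 'wordnet')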
def download():
    """
    Download the reuters data and stopwords if not already present.
    """
    nltk.download("reuters")
    nltk.download("stopwords")
def __init__(self, num_topics=6, num_iterations=500, random_state=None, clean_text=True, vectorizer=None):
    """
    Init for LDA estimator
    :param num_topics: Number of topics to model (generally 3-10)
    :type num_topics: int
    :param num_iterations: Number of iterations to allow before locking in topics
    :type num_iterations: int
    :param random_state: Random seed, for consistent topics
    :type random_state: int
    :param clean_text: Whether to clean text using self.preprocess(). Recommended if you have not preprocessed
        the text already
    :type clean_text: bool
    :param vectorizer: Word vectorizer to use. The word vectorizer should convert a collection of text documents
        to a matrix of token counts
    """
    self.num_topics = num_topics
    self.num_iterations = num_iterations
    self.random_state = random_state
    self.lda_model = lda.LDA(n_topics=self.num_topics, n_iter=self.num_iterations, random_state=self.random_state)
    self.clean_text = clean_text
    self.get_topic_description_df = None
    if vectorizer is not None:
        self.vectorizer = vectorizer
    else:
        self.vectorizer = CountVectorizer()
    # Make sure nltk has the required data sets
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
# File: esa_jobtitle_normalizer.py | Project: skills-ml | Author: workforce-data-initiative
def __init__(self, onet_source=OnetSourceDownloader):
    self.onet_downloader = onet_source()
    self.onet_titles = self.retrieve_onet_titles()
    logging.info('Retrieved onet titles')
    # ... Following the ESA description:
    # https://en.wikipedia.org/wiki/Explicit_semantic_analysis
    self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    # optimization note: convert from CSR to CSC
    self.tf = self.tfidf_vectorizer.fit_transform(self.onet_titles['Description'].values)
    self.concept_row = self.onet_titles.index.values
    try:
        wn.synset
    except LookupError:
        nltk.download('wordnet')
# File: download_corpora.py | Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda | Author: SignalMedia
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)
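# MIN_CORPORA and ALL_CORPORA are module-level lists that are not shown in
# these snippets. A plausible sketch, assuming the usual split between a
# minimal set and an extended set of corpora (the project's exact lists may
# differ):
MIN_CORPORA = ['brown', 'punkt', 'wordnet', 'averaged_perceptron_tagger']
ALL_CORPORA = MIN_CORPORA + ['conll2000', 'movie_reviews']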
def nltkDownload(self):
    try:
        nltk.data.find("tokenizers")
    except LookupError:
        # self.dis.spinner("Downloading NLTK Data")
        print("No NLTK data found, downloading now...")
        nltk.download("all")
        # self.dis.stop()

# The searcher finds tweets in the database that match the search term handed
# to it. It returns the tweets, the term, and the number of times the term
# appears in the database, in a dictionary.
# It must be handed:
#  * a search term as a string
def run():
    nltk.download('punkt')
def _post():
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
def __init__(self, wiki, vocab, n_consec):
    self.wiki = wiki
    self.vocab = vocab
    self.n_consec = n_consec  # number of consecutive sections that are used to form a query
    nltk.download('punkt')
    self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def nltk_download_packages():
    nltk.download("words")
    nltk.download("brown")
    nltk.download("abc")
    nltk.download("inaugural")
    nltk.download("genesis")
def tokenize(str_stream, eos=True, remove_punct=False):
    """
    Given a str or a str stream (f.read()), convert it to a list of sentences,
    e.g.: [[word, word], [word, word, ...], ...]
    :param str_stream: a str or a str stream
    :param eos: whether to turn '.' into an <eos> tag
    :param remove_punct: whether to remove punctuation: ':', ';', '--', ',', "'"
    :return: a list of sentences, each sentence is a list of words (str)
    """
    # do a lazy import because importing nltk is very slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punkt resource not found, using nltk.download("punkt") to download resource data...')
        nltk.download('punkt')
    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]
    # get POS tags
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)
    # tag numbers
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]
    if eos:
        for token in tokens:
            token[-1] = '<eos>'
    if remove_punct:
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]
    return tokens, pos_tags
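# isfloat() and __punct_set are referenced in tokenize() but defined elsewhere
# in the original module. A minimal sketch of plausible definitions, assuming
# isfloat() marks numeric tokens and __punct_set matches the punctuation listed
# in the docstring; the module's actual code may differ.
__punct_set = {':', ';', '--', ',', "'"}


def isfloat(s):
    # True if the token parses as a number, so it can be replaced by the 'N' tag.
    try:
        float(s)
        return True
    except ValueError:
        return False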