def load_movie_reviews():
    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except LookupError:
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. '
              'We will first download the necessary corpus '
              '(this is a one-time download that might take a little while).')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []
    # NLTK's corpus is structured in an interesting way:
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():
        if category == 'pos':
            pretty_category_name = 'positive'
        elif category == 'neg':
            pretty_category_name = 'negative'
        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):
            # each review is an NLTK sequence whose items are the review's words
            review_words = movie_reviews.words(fileid)
            review_text = ' '.join(review_words)
            review_dictionary = {
                'text': review_text,
                'sentiment': pretty_category_name,
            }
            raw_data.append(review_dictionary)
    return raw_data
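A usage sketch: the returned list of dicts can be tallied directly (the corpus ships 1000 positive and 1000 negative reviews).

from collections import Counter

reviews = load_movie_reviews()
print(Counter(r['sentiment'] for r in reviews))
# e.g. Counter({'negative': 1000, 'positive': 1000})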
Python download() — example source code
def download():
    """Skip unverified-certificate errors and show the NLTK download dialog."""
    import ssl
    import nltk
    try:
        create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # older Python versions do not verify HTTPS by default, nothing to patch
        pass
    else:
        ssl._create_default_https_context = create_unverified_https_context
    nltk.download()
def download(self, name: str) -> None:
    # only fetch the package if it is not already cached in self.nltk_dir
    if not self.exists(name):
        nltk.download(name, download_dir=self.nltk_dir)
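The method above references self.exists and self.nltk_dir, which are not shown in the snippet; a minimal sketch of one plausible enclosing class (everything beyond the method itself is an assumption, not the original):

import os
import nltk

class NltkDownloader:
    def __init__(self, nltk_dir):
        self.nltk_dir = nltk_dir
        # make the custom directory visible to nltk.data.find()
        nltk.data.path.append(nltk_dir)

    def exists(self, name):
        # treat a package as installed if any category directory contains it
        return any(
            os.path.isdir(os.path.join(self.nltk_dir, category, name))
            for category in ('corpora', 'taggers', 'tokenizers')
        )

    def download(self, name):
        if not self.exists(name):
            nltk.download(name, download_dir=self.nltk_dir)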
def run(self):
    import nltk
    from memex_dossier.models.tests.test_features import nltk_data_packages
    for data_name in nltk_data_packages:
        print('nltk.download(%r)' % data_name)
        nltk.download(data_name)
def nltk_data():
    for data_name in nltk_data_packages:
        print('nltk.download(%r)' % data_name)
        nltk.download(data_name)
def setup_nltk(self, **kw):
    import nltk
    from nltk.data import find

    tagger = "averaged_perceptron_tagger"
    try:
        find("taggers/%s" % tagger)
    except LookupError:
        click.echo("Downloading NLTK data (~2MB)...")
        nltk.download(tagger)
        return True
    return False
def initstopwords(self):
    try:
        s = set(stopwords.words('english'))
    except LookupError:
        # fetch only the stopwords package instead of opening the blocking GUI
        import nltk
        nltk.download('stopwords')
        s = set(stopwords.words('english'))
    st = LancasterStemmer()
    for each in s:
        self.stopwords.append(st.stem(each))

# Given a dictionary of key: frequency, value: array of words,
# build the opposite
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)

def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)
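MIN_CORPORA and ALL_CORPORA are module-level lists defined elsewhere in the source project; a plausible sketch with hypothetical values (the real lists may differ):

MIN_CORPORA = ['brown', 'punkt', 'wordnet', 'averaged_perceptron_tagger']
ALL_CORPORA = MIN_CORPORA + ['conll2000', 'movie_reviews']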
def install():
    for d in dependencies:
        pip.main(['install', d])
    # now that the nltk module has been installed, fetch its data packages
    import nltk
    for data in nltk_data:
        nltk.download(data)
def install_nltk_corpora(*packages):
    nltk_packages = list(packages)
    try:
        installed = (set(os.listdir(nltk.data.find("corpora"))) |
                     set(os.listdir(nltk.data.find("taggers"))) |
                     set(os.listdir(nltk.data.find("tokenizers"))))
    except LookupError:
        installed = set()
    if not set(nltk_packages) <= installed:
        nltk.download(nltk_packages)
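A usage sketch: pass the package names you need and the helper downloads whatever is missing (nltk.download accepts a list of package ids).

install_nltk_corpora('punkt', 'stopwords', 'wordnet')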
def build_dict_from_nltk(output_file, corpus=None, stopwords=None,
                         stemmer=Stemmer(), measure='IDF', verbose=False):
    '''
    @param output_file: the name of the file where the dictionary should be
                        saved
    @param corpus: the NLTK corpus to use (defaults to nltk.corpus.reuters)
    @param stopwords: a list of (unstemmed) stopwords (defaults to
                      nltk.corpus.reuters.words('stopwords'))
    @param stemmer: the L{Stemmer} object to be used
    @param measure: the measure used to compute the weights ('IDF',
                    i.e. 'inverse document frequency', or 'ICF', i.e.
                    'inverse collection frequency'; defaults to 'IDF')
    @param verbose: whether information on the progress should be printed
                    on screen
    '''
    from build_dict import build_dict
    import nltk
    import pickle

    if not (corpus and stopwords):
        nltk.download('reuters')
    corpus = corpus or nltk.corpus.reuters
    stopwords = stopwords or nltk.corpus.reuters.words('stopwords')

    corpus_list = []
    if verbose:
        print('Processing corpus...')
    for file in corpus.fileids():
        doc = [stemmer(Tag(w.lower())).stem for w in corpus.words(file)
               if w[0].isalpha()]
        corpus_list.append(doc)

    if verbose:
        print('Processing stopwords...')
    stopwords = [stemmer(Tag(w.lower())).stem for w in stopwords]

    if verbose:
        print('Building dictionary...')
    dictionary = build_dict(corpus_list, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1)
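A usage sketch (the output filename is hypothetical): build an IDF dictionary from the Reuters corpus, then load it back from the pickle.

build_dict_from_nltk('reuters_idf.pkl', measure='IDF', verbose=True)

import pickle
with open('reuters_idf.pkl', 'rb') as f:
    dictionary = pickle.load(f)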
def download():
    nltk.download()
def download_nltk_data(package_name='all'):
    """Download necessary data from NLTK.

    args:
        package_name: string containing the package name to install
    returns:
        None
    """
    if package_name == 'all':
        data = ['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger']
        for package in data:
            download(package)
    else:
        download(package_name)
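A usage sketch, assuming download is nltk.download imported at module level: with no argument the helper fetches the whole default list, and with a name it fetches just that package.

download_nltk_data()         # punkt, wordnet, stopwords, and the tagger
download_nltk_data('punkt')  # a single package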
def create_data_paths():
    if not os.path.isdir(DATA_DIR):
        raise EnvironmentError('Needs to be run from project directory containing ' + DATA_DIR)
    needed_paths = [
        os.path.join(DATA_DIR, 'samples'),
        os.path.join(DATA_DIR, 'val_samples'),
        os.path.join(DATA_DIR, 'Models'),
    ]
    for p in needed_paths:
        make_sure_path_exists(p)
# adapted from http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python
def _sentence_tokenizer(self, language):
    try:
        path = to_string("tokenizers/punkt/%s.pickle") % to_string(language)
        return nltk.data.load(path)
    except (LookupError, zipfile.BadZipfile):
        raise LookupError(
            "NLTK tokenizers are missing. Download them with the following command: "
            '''python -c "import nltk; nltk.download('punkt')"'''
        )
def english_sentence_segment(text):
    """Segment text into sentences."""
    try:
        sent_detector = nltk.data.load(
            'tokenizers/punkt/english.pickle'
        )
        # register extra abbreviations so they do not end a sentence
        extra_abbrev = ["e.g", "al", "i.e"]
        sent_detector._params.abbrev_types.update(extra_abbrev)
        return sent_detector.tokenize(text)
    except LookupError:
        raise LookupError(
            "NLTK tokenizers are missing. Download them with the following command: "
            '''python -c "import nltk; nltk.download('punkt')"'''
        )
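A usage sketch: the extra abbreviations stop the tokenizer from splitting after "e.g." or "i.e.".

sentences = english_sentence_segment(
    'Some tools, e.g. NLTK, segment text. This is a second sentence.')
# expected: two sentences, with no spurious break after 'e.g.'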
def download_preferences(self):
    import nltk  # import the natural language processing module
    nltk.download()  # open the GUI-based NLTK data downloader