Python download() — example source code

NewsAutosummarize.py (project: Python-Scripts-Repo-on-Data-Science, author: qalhata)
def get_only_text_washingtonpost_url(url):
    # this func will take the URL as an argument and return only
    # the raw text of the url.
    # this function works specifically for the washPost articles
    # because we know the structure of the pages
    page = urllib.urlopen(url).read().decode('utf8')
    # we download the URL
    soup = BeautifulSoup(page)
    # initialize a beautifulsoup object with the page we downloaded
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way e.g. <article> stuff</article>
    # the above format is specific to the washington post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text
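
A usage sketch for the scraper above; the URL below is a placeholder, and the imports it relies on (urllib and BeautifulSoup) are assumed to be present at module level in the original script:

# Hypothetical usage -- the URL is a placeholder, not a real article
article_url = 'https://www.washingtonpost.com/example-article'
title, body_text = get_only_text_washingtonpost_url(article_url)
print(title)
print(body_text[:300])   # first 300 characters of the extracted body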

#######################################################################

# TEST
######################################################################

SuggestionMiningDL.py (project: NUIG-suggestion, author: MixedEmotions)
def activate(self, *args, **kwargs):

        np.random.seed(1337)  # for reproducibility

        st = datetime.now()
        self._classifierModel = load_model(self.savedModelPath)       
        logger.info("{} {}".format(datetime.now() - st, "loaded _classifierModel"))

        st = datetime.now()
        self._tokenizer = self.get_tokenizer()
        logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer"))

        #st = datetime.now()
        #nltk.download()
        #self._tokenizer_nltk = nltk.data.load('tokenizers/punkt/english.pickle')
        #logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer_nltk"))

        logger.info("SuggestionMiningDL plugin is ready to go!")

utils.py (project: R-net, author: matthew-z)
def prepare_data():
    make_dirs("data/cache")
    make_dirs("data/embedding/char")
    make_dirs("data/embedding/word")
    make_dirs("data/squad")
    make_dirs("data/trained_model")
    make_dirs("checkpoint")

    nltk.download("punkt")

    train_filename = "train-v1.1.json"
    dev_filename = "dev-v1.1.json"
    squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"

    train_url = os.path.join(squad_base_url, train_filename)
    dev_url = os.path.join(squad_base_url, dev_filename)

    download_prefix = os.path.join("data", "squad")
    maybe_download(train_url, download_prefix, train_filename)
    maybe_download(dev_url, download_prefix, dev_filename)

    char_embedding_pretrain_url = "https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt"
    char_embedding_filename = "glove_char.840B.300d.txt"
    maybe_download(char_embedding_pretrain_url, "data/embedding/char", char_embedding_filename)
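
The maybe_download helper used above is not part of this excerpt; a hypothetical sketch of such a helper (skip the download when the target file already exists) might look like this:

import os
import urllib.request

def maybe_download(url, download_dir, filename):
    """Hypothetical sketch: fetch url into download_dir/filename unless it already exists."""
    os.makedirs(download_dir, exist_ok=True)
    filepath = os.path.join(download_dir, filename)
    if not os.path.exists(filepath):
        urllib.request.urlretrieve(url, filepath)
    return filepath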

download.py (project: textkit, author: learntextvis)
def download():
    '''
    Install required libraries.
    Note that this will install nltk dependencies into your
    user directory.
    '''

    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]

    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1])
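
check_packages_exist is defined elsewhere in textkit; a hypothetical sketch of such a check, returning the (category, package) tuples that nltk.data.find cannot locate:

import nltk

def check_packages_exist(extensions):
    """Hypothetical sketch: return the (category, package) tuples not found in nltk.data."""
    missing = []
    for category, package in extensions:
        try:
            nltk.data.find('{}/{}'.format(category, package))
        except LookupError:
            missing.append((category, package))
    return missing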

esa_jobtitle_normalizer.py (project: skills-ml, author: workforce-data-initiative)
def retrieve_onet_titles(self):
        onet_titles = pd.concat(
            (pd.read_csv(self.onet_downloader.download(
                version,
                'Occupation Data.txt',
                'occupation_data.txt'
            ), sep='\t') for version in ONET_VERSIONS),
            ignore_index=True
        )
        # Assumes pandas 0.19, keeps newest duplicate Title
        onet_titles.drop_duplicates('Title', inplace=True, keep='last')
        onet_titles['Major'] = onet_titles.iloc[:, 0].apply(lambda x: x[:2])

        LOWER = True
        if LOWER:
            # all RDD strings are unicode
            onet_titles['Title'] = onet_titles['Title'].str.lower()
            onet_titles['Description'] = onet_titles['Description'].str.lower()

        # now we can do a title -> Major, Minor lookup
        onet_titles.set_index('Title', inplace=True)
        # access with onet_titles.loc[u'Sales Agents, Financial Services']
        return onet_titles

embeddings_dict.py (project: deeppavlov, author: deepmipt)
def __init__(self, opt, embedding_dim):
        """Initialize the class according to given parameters."""

        self.tok2emb = {}
        self.embedding_dim = embedding_dim
        self.opt = copy.deepcopy(opt)
        self.load_items()

        nltk.download('punkt')

        if not self.opt.get('fasttext_model'):
            raise RuntimeError('No pretrained fasttext model provided')
        self.fasttext_model_file = self.opt.get('fasttext_model')
        if not os.path.isfile(self.fasttext_model_file):
            emb_path = os.environ.get('EMBEDDINGS_URL')
            if not emb_path:
                raise RuntimeError('No pretrained fasttext model provided')
            fname = os.path.basename(self.fasttext_model_file)
            try:
                print('Trying to download a pretrained fasttext model from the repository')
                url = urllib.parse.urljoin(emb_path, fname)
                urllib.request.urlretrieve(url, self.fasttext_model_file)
                print('Downloaded a fasttext model')
            except Exception as e:
                raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable is set incorrectly', e)

        self.fasttext_model = fasttext.load_model(self.fasttext_model_file)

setup.py (project: goose, author: sexxis)
def main():
    nltk_deps = ['punkt', 'averaged_perceptron_tagger']
    print 'Checking nltk deps...'
    map(nltk.download, nltk_deps)
    print 'nltk deps done'

tokenizer.py (project: tokenquery, author: ramtinms)
def __init__(self, tokenizer_type="PTBTokenizer"):

        # Sanity checks
        if tokenizer_type in ['SpaceTokenizer', 'NLTKWhiteSpaceTokenizer', 'PTBTokenizer']:
            self.tokenizer_type = tokenizer_type
        else:
            print ("Unrecognized tokenizer type : setting back to default (PTBTokenizer)")
            self.tokenizer_type = "PTBTokenizer"
        try:
            nltk.data.find('punkt.zip')
        except LookupError:
            nltk.download('punkt')

pos_tagger.py (project: tokenquery, author: ramtinms)
def __init__(self):
        try:
            nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
        except LookupError:
            nltk.download('averaged_perceptron_tagger')

setup.py (project: UrbanSearch, author: urbansearchTUD)
def load_nltk_data():
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('snowball_data')

nltkdatafiles.py (project: coquery, author: gkunter)
def download_packages(self):
        import nltk

        for x in [comp for comp in self._missing if "/" in comp]:
            package = x.split("/")[1]
            self.updateLabel.emit(package)
            nltk.download(package, raise_on_error=True)
            self.progressTheBar.emit()

__init__.py (project: fabric8-analytics-tagger, author: fabric8-analytics)
def prepare():
    """Prepare tagger for run.

    This should be run after installation to initialize the tagger's resources.
    """
    import nltk
    import requests
    from libarchive import extract_memory
    import os
    from shutil import move
    from f8a_tagger.utils import get_files_dir

    nltk.download("punkt")
    nltk.download("wordnet")

    maven_index_checker_url = 'https://github.com/fabric8-analytics/' \
                              'maven-index-checker/files/1275145/' \
                              'maven-index-checker-v0.1-alpha.zip'
    response = requests.get(maven_index_checker_url)
    if response.ok is not True:
        raise RemoteDependencyMissingError("Failed to download maven-index-checker with "
                                           "response code %s",
                                           response.status_code)

    # Unfortunately there is no way to know the name or path of the extracted file,
    # so assume it's maven-index-checker.jar
    jar_name = "maven-index-checker.jar"

    jar_path = get_files_dir()
    extract_memory(response.content)
    move(jar_name, os.path.join(jar_path, jar_name))

setup.py (project: presswork, author: hangtwenty)
def run(self):
        # setuptools is an oldie goldie. super() is not supported by base class (it's an "old style class")
        SetuptoolsInstallCommand.do_egg_install(self)

        import nltk
        for corpus in _required_nltk_corpora:
            nltk.download(corpus)

nltk_normalization.py (project: vec4ir, author: lgalke)
def install_nltk_corpora(*packages):
        nltk_packages = list(packages)
        try:
            installed = (set(os.listdir(nltk.data.find("corpora"))) |
                         (set(os.listdir(nltk.data.find("taggers"))))) | \
                        (set(os.listdir(nltk.data.find("tokenizers"))))
        except LookupError:
            installed = set()
        if not set(nltk_packages) <= set(installed):
            nltk.download(nltk_packages)
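
A usage sketch for the helper above, assuming it is callable at module level (in vec4ir it may be defined inside a class); the package names mirror resources used elsewhere on this page. Note that nltk.download() also accepts a list of identifiers, which is why the helper can pass nltk_packages in a single call:

# Example call with resources used elsewhere on this page
install_nltk_corpora('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger')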

setup.py (project: bsd, author: cjhutto)
def run(self):
        # PUT YOUR POST-INSTALL SCRIPT HERE or CALL A FUNCTION
        import nltk
        nltk.download('punkt')
        install.run(self)

resources.py (project: wordsim, author: recski)
def ensure_nltk_packages():
    for package in ('stopwords', 'punkt', 'wordnet'):
        nltk.download(package, quiet=True)

helpers.py (project: bigworldgraph, author: majdigital)
def download_nltk_resource_if_missing(resource_path, resource):
    """
    Download a missing resource from the Natural Language Processing Toolkit.

    :param resource_path: Link / path for NLTK resource.
    :type resource_path: str
    :param resource: Identifier / name of resource (will be used to download the resource if it's not found).
    :type resource: str
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource)
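
A short usage sketch for the helper above, showing how the lookup path given to nltk.data.find differs from the identifier given to nltk.download:

# The first argument is the nltk.data lookup path, the second the download identifier
download_nltk_resource_if_missing('tokenizers/punkt', 'punkt')
download_nltk_resource_if_missing('corpora/stopwords', 'stopwords')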

reuters.py (project: MachineLearningProject, author: ymynem)
def download():
    """
    Download Reuters data and stopwords if not already present.
    """
    nltk.download("reuters")
    nltk.download("stopwords")

easy_lda.py (project: Easy-Latent-Dirichlet-Allocation, author: bjherger)
def __init__(self, num_topics=6, num_iterations=500, random_state=None, clean_text=True, vectorizer=None):
        """
        Init for LDA estimator
        :param num_topics: Number of topics to model (generally 3-10)
        :type num_topics: int
        :param num_iterations: Number of iterations to allow before locking in topics
        :type num_iterations: int
        :param random_state: Random seed, for consistent topics
        :type random_state: int
        :param clean_text: Whether to clean text using self.preprocess(). Recommended if you have not preprocessed
        the text already
        :type clean_text: bool
        :param vectorizer: Word vectorizer to use. The word vectorizer should convert a collection of text documents
        to a matrix of token counts
        """
        self.num_topics = num_topics
        self.num_iterations = num_iterations
        self.random_state = random_state
        self.lda_model = lda.LDA(n_topics=self.num_topics, n_iter=self.num_iterations, random_state=self.random_state)
        self.clean_text = clean_text
        self.get_topic_description_df = None
        if vectorizer is not None:
            self.vectorizer = vectorizer
        else:
            self.vectorizer = CountVectorizer()

        # Make sure nltk has required data sets
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
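
A hypothetical instantiation of the estimator above; the class name EasyLDA is assumed for illustration (the excerpt only shows __init__), and every parameter value is a placeholder:

from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical: class name and parameter values are illustrative only
lda_estimator = EasyLDA(num_topics=8,
                        num_iterations=1000,
                        random_state=42,
                        clean_text=True,
                        vectorizer=CountVectorizer(max_features=5000, stop_words='english'))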

esa_jobtitle_normalizer.py (project: skills-ml, author: workforce-data-initiative)
def __init__(self, onet_source=OnetSourceDownloader):
        self.onet_downloader = onet_source()
        self.onet_titles = self.retrieve_onet_titles()
        logging.info('Retrieved onet titles')
        # ... Following the ESA description:
        # https://en.wikipedia.org/wiki/Explicit_semantic_analysis
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        # optimization note: convert from CSR to CSC
        self.tf = self.tfidf_vectorizer.fit_transform(self.onet_titles['Description'].values)
        self.concept_row = self.onet_titles.index.values
        try:
            wn.synset
        except LookupError:
            nltk.download('wordnet')

download_corpora.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)

download_corpora.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)

analyse.py (project: Political-Opinion-Finder, author: philhabell)
def nltkDownload(self):
        try:
            nltk.data.find("tokenizers")
        except LookupError:
            #self.dis.spinner("Downloading NLTK Data")
            print("No NLTK data found, downloading now...")
            nltk.download("all")
            #self.dis.stop()


    # The searcher finds tweets in the database that match the search term handed
    # to it. It returns the tweets, the term, and the number of times the term
    # appears in the database, packaged in a dictionary.
    # It must be handed:
    #    * a search term as a string

untitled-3.py (project: Twitter, author: LucasRodriguez)
def run():
    nltk.download('punkt')

download_corpora.py (project: neighborhood_mood_aws, author: jarrellmark)
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)

download_corpora.py (project: neighborhood_mood_aws, author: jarrellmark)
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)

setup.py (project: redbiom, author: biocore)
def _post():
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')

op_sentence.py (project: WebNav, author: nyu-dl)
def __init__(self, wiki, vocab, n_consec):
        self.wiki = wiki
        self.vocab = vocab
        self.n_consec = n_consec # number of consecutive sections that are used to form a query
        nltk.download('punkt')
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
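
For reference, the object loaded into self.tokenizer above is NLTK's Punkt sentence tokenizer; a minimal standalone sketch of how it is typically used (the sample text is illustrative):

import nltk

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences = sent_tokenizer.tokenize('First sentence. Second sentence.')
# sentences -> ['First sentence.', 'Second sentence.']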

iwords.py (project: ip6words, author: lstn)
def nltk_download_packages():
    nltk.download("words")
    nltk.download("brown")
    nltk.download("abc")
    nltk.download("inaugural")
    nltk.download("genesis")

text_processor.py (project: RNNVis, author: myaooo)
def tokenize(str_stream, eos=True, remove_punct=False):
    """
    Given a str or a str stream (e.g. f.read()), convert it to a list of sentences,
        e.g.: [[word, word], [word, word, ...], ...]
    :param str_stream: a str or a str stream
    :param eos: whether to turn the sentence-final '.' into an <eos> tag
    :param remove_punct: whether to remove punctuation: ':', ';', '--', ',', "'"
    :return: a list of sentences, each sentence is a list of words (str)
    """
    # lazy import because importing nltk is very slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punct resource not found, using nltk.download("punkt") to download resource data...')
        nltk.download('punkt')
    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]
    # get POS Tags
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)
    # tag number
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]
    if eos:
        for token in tokens:
            token[-1] = '<eos>'
    if remove_punct:
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]
    return tokens, pos_tags
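
A usage sketch for tokenize() above; it assumes the module-level helpers the function references (isfloat and __punct_set) are defined as in the original RNNVis source, and the expected output is approximate:

tokens, pos_tags = tokenize("The model converged. Loss was 0.03.", eos=True)
# tokens   -> [['the', 'model', 'converged', '<eos>'], ['loss', 'was', 'N', '<eos>']]
# pos_tags -> one tuple of universal POS tags per sentence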

