def copy_packages(self):
    # Copy each missing, zipped NLTK package from the bundled _NLTK_dir into
    # the first directory on NLTK's data path, then unpack it in place.
    import nltk.data
    target_path = nltk.data.path[0]
    for x in [comp for comp in self._missing if "/" in comp]:
        parts = x.split("/")
        subdir = os.path.join(target_path, parts[0])
        package = parts[1]
        zip_name = "{}.zip".format(package)
        self.updateLabel.emit(package)
        src = os.path.join(_NLTK_dir, zip_name)
        dst = os.path.join(subdir, zip_name)
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        if os.path.exists(src):
            shutil.copyfile(src, dst)
        else:
            raise ValueError("Package file {}.zip not found in {}".format(package, _NLTK_dir))
        with zipfile.ZipFile(dst) as zipped:
            for member in zipped.infolist():
                zipped.extract(member, subdir)
        self.progressTheBar.emit()
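A minimal companion sketch, assuming NLTK is installed: it shows how NLTK resolves its data directories, which is where copy_packages() above unpacks the missing packages. The resource name 'tokenizers/punkt' is only an example.

import nltk.data

# The directories NLTK searches for corpora, tokenizers and other packages;
# copy_packages() writes into the first entry.
print(nltk.data.path)

# nltk.data.find() resolves a resource inside those directories and raises
# LookupError when the package has not been installed yet.
try:
    print(nltk.data.find('tokenizers/punkt'))
except LookupError:
    print("punkt is not installed")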
Example source code for Python's nltk.data module.
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # ceil(data_size / batch_size); subtracting 1 first avoids yielding an
    # empty final batch when data_size is an exact multiple of batch_size.
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
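A short usage sketch, assuming numpy is imported as np (the snippet already requires it): it iterates over a toy dataset of ten (x, y) pairs in batches of four for two epochs.

import numpy as np

toy_data = list(zip(np.arange(10), np.arange(10) * 2))  # ten (x, y) pairs
for batch in batch_iter(toy_data, batch_size=4, num_epochs=2, shuffle=True):
    print(batch.shape)  # (4, 2) for full batches, (2, 2) for the final one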
def read_json_file(path_to_json):
    objects = []
    data = ''
    with io.open(path_to_json, 'r', encoding='utf8') as f:
        for line in f:
            # A blank line marks the end of one pretty-printed JSON object.
            if line in ('\n', '\r\n'):
                objects.append(json.loads(data))
                data = ''
            else:
                data += line
    try:
        # Parse whatever is left after the last blank line.
        objects.append(json.loads(data))
    except ValueError:
        return objects
    return objects
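A hedged usage sketch with a hypothetical file name; the expected input is a file of pretty-printed JSON objects separated by blank lines, as the blank-line check above implies.

records = read_json_file('sentences.json')  # hypothetical path
print(len(records), "JSON objects loaded")
print(type(records[0]))                     # dict, one per blank-line-separated block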
# get original sentence, compression sentence
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining arguments
    are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags or self.TAGS
def docs(self, fileids=None, categories=None):
    """
    Returns the complete JSON document for every file in the corpus.
    Note that I attempted to use the nltk ``CorpusView`` and ``concat``
    methods here, but was not getting memory safe iteration. Instead the
    simple Python generator by far did a better job of ensuring that file
    handles got closed and that not all data was loaded into memory at a
    time. In the future, I will try to re-implement the corpus view.
    """
    # Resolve the fileids and the categories
    fileids = self._resolve(fileids, categories)

    # Create a generator, loading one document into memory at a time.
    for path, enc, fileid in self.abspaths(fileids, True, True):
        with codecs.open(path, 'r', encoding=enc) as f:
            yield json.load(f)
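A hypothetical usage sketch: the class name JSONCorpusReader and the directory layout are assumptions (the listing only shows an __init__ and docs() that combine CategorizedCorpusReader and CorpusReader), and the category name is a placeholder for whatever CAT_PATTERN extracts from the file paths.

corpus = JSONCorpusReader('data/corpus')      # hypothetical class and corpus root
for doc in corpus.docs(categories=['news']):  # 'news' is a placeholder category
    print(type(doc))                          # each item is one parsed JSON document
    break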
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
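A usage sketch against the small city database used by NLTK's chat80 module; it assumes that data package has been downloaded and uncompressed (e.g. via nltk.download('city_database')) and that the table and column names follow the chat80 schema.

rows = sql_query('corpora/city_database/city.db',
                 "SELECT City, Country FROM city_table WHERE Country = 'china'")
for city, country in rows:
    print(city, country)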
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    #book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    #book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)), np.ones(len(book))]).T
    sentences, labels = x_text, y
    sentences_padded = pad_sentences(sentences)
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
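A hypothetical call; the path is a placeholder, and clean_str, pad_sentences, build_vocab and build_input_data must already be defined in the same module (they are not shown in this listing).

x, y, vocabulary, vocabulary_inv, sentencesT = load_data_for_books("./data/books/sample_book.txt")
print(len(x), len(y), len(vocabulary))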
def add_full_stops_to_the_end(infile, outfile):
    # Clean data of short titles and add full stops so that NLTK sentence
    # tokenization works on the headlines.
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # ignore headlines with three words or fewer
            elif len(line.split()) <= 3:
                pass
            elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('\'\n') or line.endswith('"\n'):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),
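A minimal call with placeholder file names (Python 2, like the snippet): the input holds one headline per line, and the output is the same text with a trailing full stop appended to headlines that lack end punctuation.

add_full_stops_to_the_end('headlines_raw.txt', 'headlines_with_stops.txt')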
############################################
# Convert All except first word and quotes
# to lower case #
############################################
def location(url):
    fdata = {'Accept': '*/*',
             'Accept-Encoding': 'gzip, deflate',
             'Accept-Language': 'en-US,en;q=0.8',
             'Connection': 'keep-alive',
             'Content-Length': '29',
             'Content-type': 'application/x-www-form-urlencoded',
             'Cookie': 'PHPSESSID=hisbu0rrh09nssn99vckkqr740; __utma=103585558.1324897437.1443987736.1443987736.1443987736.1; __utmb=103585558.2.10.1443987736; __utmc=103585558; __utmz=103585558.1443987736.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
             'Host': 'get-site-ip.com',
             'Origin': 'http://get-site-ip.com',
             'Referer': 'http://get-site-ip.com/',
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    response = requests.post('http://get-site-ip.com/_pages/_moduler/ajaxSkrivUtIpPaNamn.php',
                             data={'dnsNakeLookUp_In': url})
    #print response.content
    soup = BeautifulSoup(response.content, "lxml")
    #print "Location : "
    for i in soup.find_all("div", {"class": "response"}):
        # print i.get_text()
        # print i.get_text().split('-')[2].replace(' ','')
        return i.get_text().split('-')[2].replace(' ', '')
# Counts how many HTML elements tidy reports as out of place
def nofoutofplacefeatures(url):
    # pdb.set_trace()
    if url[:4] == "http":
        r = requests.get(url)
    else:
        url = "http://" + url
        r = requests.get(url)
    #r = requests.get(url)
    data = r.text
    data2 = r.content
    document, errors = tidy_document(data,
                                     options={'numeric-entities': 1})
    #print document
    #print errors
    #print "Number of Elements Out of Place : " + str(len(errors))
    return len(errors)
def reg_date(url):
    url = url.strip("www.")
    print url
    ur = "http://www.whois.com/whois/" + url
    r = requests.get(ur)
    data = r.content.decode("utf-8")
    #print data
    try:
        soup = BeautifulSoup(data)
        #<div class="whois_result">
        for link in soup.find_all("div", {"class": "whois_result"}):
            site = link.get_text().lower()
            print site.decode("utf-8")
            print "\n date is \n"
            print re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
    except:
        pass
def nofoutofplacefeatures(url):
    try:
        # pdb.set_trace()
        if url[:4] == "http":
            r = requests.get(url)
        else:
            url = "http://" + url
            r = requests.get(url)
        #r = requests.get(url)
        data = r.text
        data2 = r.content
        document, errors = tidy_document(data,
                                         options={'numeric-entities': 1})
        #print document
        #print errors
        #print "Number of Elements Out of Place : " + str(len(errors))
        return len(errors)
    except:
        pass
def reg_date(url):
    url = url.strip("www.")
    #print url
    ur = "http://www.whois.com/whois/" + url
    r = requests.get(ur)
    data = r.content.decode("utf-8")
    #print data
    try:
        soup = BeautifulSoup(data, "lxml")
        #<div class="whois_result">
        for link in soup.find_all("div", {"class": "whois_result"}):
            site = link.get_text().lower()
            #print site.decode("utf-8")
            print "\n Domain registration date is " + re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
    except:
        pass
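A rough usage sketch combining the three feature helpers above on one hostname; it assumes requests, BeautifulSoup, re and tidy_document are imported at module level, and the results depend on third-party services (get-site-ip.com, whois.com) that may have changed since this code was written.

url = "example.com"  # placeholder hostname
features = {
    'tidy_errors': nofoutofplacefeatures(url),  # count of out-of-place HTML elements
    'server_location': location(url),           # location string scraped from get-site-ip.com
    'registration_date': reg_date(url),         # e.g. '14-aug-1995' scraped from whois.com
}
print(features)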
def read_data(source):
    """
    Reads the sentence data from the csv file, which is of the form (sentence, is_summary_sentence).

    Args:
        source = the data file to read the data from

    Returns:
        A list of tuples where each tuple is of the form (sentence, is_summary_sentence).
    """
    sentences = []
    count = 0
    with open(source, "r") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            sentence = row[0]
            sentence = sentence.strip("\"")
            sentence = sentence.strip("[")
            sentence = sentence.strip("]")
            sentence = sentence.replace("'", "")
            sentence = sentence.replace(" ", "")
            sentence = sentence.split(",")
            sentences.append(sentence)
            count += 1
    return sentences
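A usage sketch with a hypothetical CSV layout: column 0 of each row holds a stringified word list such as "['the', 'cat', 'sat']", which read_data() strips down to a plain list of words.

papers = read_data('summary_sentences.csv')  # placeholder filename
print(len(papers), "rows read")
print(papers[0])                             # e.g. ['the', 'cat', 'sat']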
# ============================================
# ================ MAIN PROGRAM ==============
# Read in all of the papers into a list of lists. Each item in the list is a sentence, in the form of a list of words.
smell_datamine_multiprocessing.py source (project: Smelly-London, author: Smelly-London)
def tokenize_to_sentence(text):
    parser = nltk.data.load('tokenizers/punkt/english.pickle')

    # split into sentences
    sentences = parser.tokenize(text.strip())
    return [lemmatize_sentence(sentence) for sentence in sentences]
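A small usage sketch; lemmatize_sentence() is assumed to be defined elsewhere in the same module, so each item in the result is whatever that helper returns for one Punkt-detected sentence.

report_text = "The smell was noted in June. It had gone by July."
for processed in tokenize_to_sentence(report_text):
    print(processed)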
def getMeta(self, fileName):
    """Return the meta data for a given fileName, e.g. year, url, MOH, borough, bID."""
    splitReport = fileName.split('.')
    bID = splitReport[2]
    year = splitReport[1]
    url = self.getUrl(bID)
    try:
        region = mapping[bID][1]
        mohRegion = mapping[bID][0]
    except:
        # TODO there is a problem with mappings, e.g. Acton.1915.b19783905.txt. Region cannot be found
        print(fileName)
        return (None, None, None, None, None)
    return year, region, bID, url, mohRegion
def get_app_data(app_id):
    url = 'http://store.steampowered.com/api/appdetails?appids=' + str(app_id)
    response = urllib.urlopen(url)
    try:
        data = json.loads(response.read())
        if not data[str(app_id)]['success'] or data[str(app_id)]['data']['type'] != 'game':
            return None
        return data[str(app_id)]
    except:
        return None
def get_apps():
    url = 'http://api.steampowered.com/ISteamApps/GetAppList/v2/'
    response = urllib.urlopen(url)
    try:
        data = json.loads(response.read())
        apps = data['applist']['apps']
        return apps
    except:
        return None
def get_description_from_app_data(app_data):
    description = clean_string(app_data['data']['detailed_description'])
    sentences = SENTENCE_DETECTOR.tokenize(description.strip())
    if len(sentences) > 0:
        sentences = sentences[0:(min(3, len(sentences)))]
        sentences = [x for x in sentences if len(x.split(' ')) > 5 and not x.split(' ')[0].isupper() and x.find('\r') == -1]
        combined_sentence = ' '.join(sentences)
        if len(combined_sentence) == 0 or not combined_sentence[0].isalpha() or len(combined_sentence.split(' ')) < 5:
            return None
        return combined_sentence
    return None
def get_title_from_app_data(app_data):
    return clean_string(app_data['data']['name'])
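A hedged end-to-end sketch of the Steam helpers above (Python 2, matching urllib.urlopen): SENTENCE_DETECTOR is assumed to be a Punkt tokenizer loaded via nltk.data.load('tokenizers/punkt/english.pickle') and clean_string() a module-level helper; only the first twenty apps are scanned to keep the example short.

apps = get_apps()
if apps:
    for app in apps[:20]:                     # each entry has 'appid' and 'name'
        app_data = get_app_data(app['appid'])
        if app_data is None:                  # skip non-games and failed lookups
            continue
        title = get_title_from_app_data(app_data)
        description = get_description_from_app_data(app_data)
        if description:
            print(title)
            print(description)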