def split(p):
    output = os.path.join(get_data_home(), "kddcup.parq")
    if not os.path.exists(output):
        # Columns 1, 2, 3 and 41 of the KDD Cup file are symbolic; read them as categoricals.
        dtype = {
            1: 'category',
            2: 'category',
            3: 'category',
            41: 'category',
        }
        df = pd.read_csv(p, header=None, dtype=dtype)
        # Replace the category values with their integer codes.
        cat_cols = df.select_dtypes(include=['category']).columns
        df[cat_cols] = df[cat_cols].apply(lambda col: col.cat.codes)
        # Rename the columns to single letters so the parquet schema has simple names.
        df.columns = list(string.ascii_letters[:len(df.columns)])
        # Write out a 16-partition dask DataFrame as parquet.
        ddf = dd.from_pandas(df, npartitions=16)
        ddf.to_parquet(output)
    return output
Python get_data_home() usage examples
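All of the snippets on this page lean on scikit-learn's get_data_home(), which resolves (and creates, if necessary) the local directory where downloaded datasets are cached. A minimal illustration of the call itself; the default location under the user's home directory and the companion clear_data_home() are scikit-learn's documented behaviour:

import os
from sklearn.datasets import get_data_home, clear_data_home

# Resolve the shared dataset cache directory; it is created on first use.
data_home = get_data_home()
print(os.path.exists(data_home))          # True

# An explicit location can be passed instead of relying on the default.
custom_home = get_data_home(data_home="/tmp/my_dataset_cache")

# clear_data_home removes the cache directory and everything inside it.
clear_data_home(data_home=custom_home)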
def fetch_load_letters(data_dir=None):
    path = os.path.join(get_data_home(data_dir), 'letter-recognition.data')
    if not os.path.exists(path):
        from urllib import request
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
        print('Downloading letter-recognition dataset from {}...'.format(url))
        request.urlretrieve(url=url, filename=path)
    else:
        print('Found letter-recognition in {}!'.format(path))

    X, y = [], []
    with open(path) as f:
        reader = csv.reader(f)
        for row in reader:
            y.append(row[0])
            X.append(row[1:])
    labels, label_idx = np.unique(y, return_inverse=True)
    return np.asarray(X, dtype=float), label_idx
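A hedged usage sketch for the loader above: the first call downloads letter-recognition.data into the data home, later calls read the cached copy. The expected shape is quoted from the UCI description of the dataset (20,000 samples, 16 numeric features), not from this code:

# Hypothetical usage of fetch_load_letters(); nothing beyond the function above is assumed.
X, y = fetch_load_letters()
print(X.shape)    # expected (20000, 16) for the UCI letter-recognition data
print(y[:5])      # integer class indices from np.unique(..., return_inverse=True)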
def download():
    p = os.path.join(get_data_home(), "kddcup.data.gz")
    if os.path.exists(p):
        return p
    r = requests.get(URL, stream=True)
    with open(p, "wb") as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return p
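The two KDD Cup helpers are naturally chained: download() fetches kddcup.data.gz into the data home and split() (defined above) converts it into a partitioned Parquet file. A minimal sketch, assuming the module-level URL constant points at the archive and that pandas and dask.dataframe are imported as pd and dd as in the snippets themselves:

# Hypothetical driver for the helpers above; URL, pd, dd and get_data_home
# are assumed to be defined at module level.
raw_path = download()               # .../kddcup.data.gz in the data home
parquet_path = split(raw_path)      # .../kddcup.parq with 16 partitions
ddf = dd.read_parquet(parquet_path)
print(ddf.npartitions)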
def test_data_home():
    # get_data_home will point to a pre-existing folder
    data_home = get_data_home(data_home=DATA_HOME)
    assert_equal(data_home, DATA_HOME)
    assert_true(os.path.exists(data_home))

    # clear_data_home will delete both the content and the folder itself
    clear_data_home(data_home=data_home)
    assert_false(os.path.exists(data_home))

    # if the folder is missing it will be created again
    data_home = get_data_home(data_home=DATA_HOME)
    assert_true(os.path.exists(data_home))
def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, '20news_home')):
        raise SkipTest("Skipping dataset loading doctests")


def setup_module(module):
    data_home = get_data_home()
    if not exists(join(data_home, 'lfw_home')):
        raise SkipTest("Skipping dataset loading doctests")


def setup_module():
    check_skip_network()
    # skip the test in rcv1.rst if the dataset is not already loaded
    rcv1_dir = os.path.join(get_data_home(), "RCV1")
    if not os.path.exists(rcv1_dir):
        raise SkipTest("Download RCV1 dataset to run this test.")
def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.
    """
    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" % data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc
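Because stream_reuters_documents() is a generator, the archive is only downloaded once iteration actually starts. A small usage sketch; itertools.islice keeps the example to a handful of documents:

import itertools

# Pull the first three parsed documents; triggers the download on first use.
for doc in itertools.islice(stream_reuters_documents(), 3):
    print(doc['title'], doc['topics'])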
def fetch_load_isolet(data_dir=None):
    train = 'isolet1+2+3+4.data.Z'
    test = 'isolet5.data.Z'
    path_train = os.path.join(get_data_home(data_dir), train)
    path_test = os.path.join(get_data_home(data_dir), test)
    if not os.path.exists(path_train[:-2]) or not os.path.exists(path_test[:-2]):
        from urllib import request
        url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/isolet/'
        if not os.path.exists(path_train[:-2]):
            if not os.path.exists(path_train):
                print('Downloading Isolated Letter Speech Recognition data set from {}...'.format(url))
                request.urlretrieve(url=url + train, filename=path_train)
            # os.system('gzip -d ' + path_train)
            decompress_z(path_train)
        if not os.path.exists(path_test[:-2]):
            if not os.path.exists(path_test):
                print('Downloading Isolated Letter Speech Recognition data set from {}...'.format(url))
                request.urlretrieve(url=url + test, filename=path_test)
            # os.system('gzip -d ' + path_test)
            decompress_z(path_test)
    else:
        print('Found Isolated Letter Speech Recognition data set!')

    xtr, ytr = [], []
    with open(path_train[:-2]) as f:
        reader = csv.reader(f)
        for row in reader:
            xtr.append(row[:-1])
            ytr.append(int(float(row[-1])))
    labels, ytr = np.unique(ytr, return_inverse=True)

    xte, yte = [], []
    with open(path_test[:-2]) as f:
        reader = csv.reader(f)
        for row in reader:
            xte.append(row[:-1])
            yte.append(int(float(row[-1])))
    labels, yte = np.unique(yte, return_inverse=True)

    return np.asarray(xtr, dtype=float), np.asarray(xte, dtype=float), ytr, yte
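Usage mirrors the letters loader: both ISOLET splits are downloaded, decompressed and parsed on the first call and cached afterwards. A minimal sketch; the 617 features and 26 classes are the figures from the UCI ISOLET description, quoted here as an assumption, and np is numpy as in the snippets above:

# Hypothetical call to the loader above; files end up in the data home.
xtr, xte, ytr, yte = fetch_load_isolet()
print(xtr.shape[1] == xte.shape[1])   # both splits share the same 617 features
print(len(np.unique(ytr)))            # 26 letter classes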