def split_file(source, nfolds=None, ignoreheader=False, importance=0, minfoldsize=10000):
    """Split the lines of *source* into consecutive fold files.

    The input is read twice: once to count its lines and once to copy them,
    in order, into temporary files whose names come from ``get_temp_filename``.

    Args:
        source: A path string, or another object accepted by
            ``open_regular_or_compressed``. If it has a ``seek`` method it is
            rewound before each pass.
        nfolds: Requested number of folds; ``None`` means 10. Fewer folds are
            produced when that is needed to keep each fold (except possibly
            the last) at ``minfoldsize`` lines or more.
        ignoreheader: If true, the first line is skipped and not counted.
        importance: Not used by this function; kept for interface
            compatibility.
        minfoldsize: Lower bound on the number of lines per fold.

    Returns:
        A ``(folds, total_lines)`` tuple: the fold file names in order and
        the number of lines copied (header excluded).

    Raises:
        SystemExit: If the number of lines copied differs from the number
            counted in the first pass.
    """
    if nfolds is None:
        nfolds = 10

    # The fold files reuse the extension of a path-like source.
    ext = get_real_ext(source) if isinstance(source, basestring) else 'xxx'

    if hasattr(source, 'seek'):
        source.seek(0)

    # First pass: count the lines.
    # XXX already have examples_count
    total_lines = sum(1 for _ in open_regular_or_compressed(source))

    if hasattr(source, 'seek'):
        source.seek(0)

    source = open_regular_or_compressed(source)

    # Only skip a header that actually exists; an empty input would otherwise
    # raise StopIteration and leave total_lines at -1.
    if ignoreheader and total_lines > 0:
        next(source)
        total_lines -= 1

    foldsize = int(math.ceil(total_lines / float(nfolds)))
    # The trailing 1 keeps the division below well-defined when the input is
    # empty and minfoldsize is not positive.
    foldsize = max(foldsize, minfoldsize, 1)
    nfolds = int(math.ceil(total_lines / float(foldsize)))

    folds = []
    current_fold = -1
    count = foldsize  # forces a fold file to be opened on the first line
    current_fileobj = None
    total_count = 0

    try:
        for line in source:
            if count >= foldsize:
                if current_fileobj is not None:
                    # Drop the reference before closing so the ``finally``
                    # clause never closes the same file a second time.
                    finished, current_fileobj = current_fileobj, None
                    flush_and_close(finished)
                current_fold += 1
                if current_fold >= nfolds:
                    break
                fname = get_temp_filename('fold%s.%s' % (current_fold, ext))
                current_fileobj = open(fname, 'w')
                count = 0
                folds.append(fname)

            current_fileobj.write(line)
            count += 1
            total_count += 1
    finally:
        # Close the fold in progress even if reading or writing failed.
        if current_fileobj is not None:
            flush_and_close(current_fileobj)

    if total_count != total_lines:
        sys.exit('internal error: total_count=%r total_lines=%r source=%r' % (total_count, total_lines, source))

    return folds, total_lines
评论列表
文章目录