def get_citation_df(args, text):
"""
Generate citation_df and save it to 'citations.tsv'.
"""
citation_df = pandas.DataFrame(
{'string': get_citation_strings(text)}
)
if args.citation_tags_path.is_file():
tag_df = pandas.read_table(args.citation_tags_path)
tag_df['string'] = '@tag:' + tag_df.tag
for citation in tag_df.citation:
is_valid_citation_string('@' + citation)
citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
else:
citation_df['citation'] = None
logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
    citation_df['citation'] = citation_df['citation'].fillna(
        citation_df['string'].astype(str).str.lstrip('@'))
citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
citation_df = citation_df.sort_values(['standard_citation', 'citation'])
citation_df.to_csv(args.citations_path, sep='\t', index=False)
check_collisions(citation_df)
check_multiple_citation_strings(citation_df)
return citation_df
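A minimal, self-contained sketch of the tag-merge and fallback pattern used above, with made-up citation strings (the surrounding helpers such as get_citation_strings and standardize_citation are assumed to be defined elsewhere in the real module):

import pandas

citation_df = pandas.DataFrame({'string': ['@doi:10.1000/xyz', '@tag:deep-review']})
tag_df = pandas.DataFrame({'tag': ['deep-review'],
                           'citation': ['doi:10.1093/example']})  # made-up mapping
tag_df['string'] = '@tag:' + tag_df.tag

citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
# untagged strings fall back to the string itself, minus the leading '@'
citation_df['citation'] = citation_df['citation'].fillna(
    citation_df['string'].str.lstrip('@'))
print(citation_df)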
def gerber_green_imai():
"""
This is the dataset from Imai (2005) used to replicate and evaluate
the field experiment done by Gerber and Green (2000).
Notes
-----
.. Gerber, Alan S. and Donald P. Green. 2000. "The effects of canvassing,
telephone calls, and direct mail on voter turnout: a field experiment."
American Political Science Review 94: 653-663.
.. Gerber, Alan S. and Donald P. Green. 2005. "Correction to Gerber and Green (2000),
replication of disputed findings, and reply to Imai (2005)." American Political
Science Review 99: 301-313.
.. Imai, Kosuke. 2005. "Do get-out-the-vote calls reduce turnout? The importance of
statistical methods for field experiments." American Political Science Review 99:
283-300.
"""
fin = _os.path.join(data_dir, 'GerberGreenImai.txt')
    data = pd.read_table(fin, sep=r'\s+')
data.index = range(data.shape[0])
return data
def table_convert(fmt="csv"):
"""Convert the SC data into different formats.
To make available for download.
"""
# others netcdf, fits?
# https://pandas.pydata.org/pandas-docs/stable/io.html
if fmt not in ['tsv', 'csv', 'hdf']:
raise NotImplementedError("Conversion format to {} not available.".format(fmt))
name = "data/sweet-cat.{}".format(fmt)
if fmt is "tsv": # This is the standard
pass
else:
df = pd.read_table('data/sweet-cat.tsv')
if fmt == "hdf":
df.to_hdf(name, key="sweetcat", mode="w", format='table')
elif fmt == "csv":
df.to_csv(name, sep=",", index=False)
def get_info_map(info_link=INFO_LINK):
"""
Return a :class:`DataFrame` containing the information provided at
    *info_link*, a link to a tab-delimited text file containing
information for each USArray MT site.
"""
df = PD.read_table(info_link,
sep='\t',
skiprows=1,
names=['vnet',
'net',
'sta',
'location',
'lat',
'lon',
'elev',
'start',
'end',
'status',
'install',
'cert'],
parse_dates=[7, 8],
index_col=2)
return df
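A self-contained illustration of the parse_dates/index_col combination used above, with one fabricated row standing in for the remote USArray file:

from io import StringIO
import pandas as PD

text = ("_US-MT\tEM\tKSP32\tKansas\t38.8\t-97.6\t400\t"
        "2016-01-01\t2016-02-01\tclosed\tx\ty\n")   # made-up station record
df = PD.read_table(StringIO(text),
                   names=['vnet', 'net', 'sta', 'location', 'lat', 'lon',
                          'elev', 'start', 'end', 'status', 'install', 'cert'],
                   parse_dates=[7, 8], index_col=2)
print(df.loc['KSP32', 'start'])   # parsed as a Timestamp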
def predict(self, ifile, efile, ofile):
# Load
columns = ['documents']
data = pd.read_table(ifile, header=None, names=columns)
documents = data['documents']
# Deserialize
estimator = pickle.load(open(efile, 'rb'))
# Predict
probability = estimator.predict_proba(documents)
data['labels'] = estimator.predict(documents)
data['C1_pr'] = probability[:, 0]
data['C2_pr'] = probability[:, 1]
# Save
columns = ['labels', 'C1_pr', 'C2_pr', 'documents']
data.to_csv(
ofile,
sep = '\t',
columns = columns,
index = False
)
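The snippet above assumes that the first predict_proba column belongs to class C1. With scikit-learn estimators the column order follows estimator.classes_, so a hedged variant (with toy training data) looks the classes up explicitly:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

docs = ['good movie', 'bad movie', 'great film', 'awful film']
labels = ['C1', 'C2', 'C1', 'C2']
estimator = make_pipeline(TfidfVectorizer(), LogisticRegression())
estimator.fit(docs, labels)

proba = estimator.predict_proba(docs)
classes = list(estimator.classes_)          # column order follows classes_
c1_pr = proba[:, classes.index('C1')]
c2_pr = proba[:, classes.index('C2')]
print(np.round(c1_pr, 2), np.round(c2_pr, 2))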
def expected_result_find_overlaps():
contents = u"""Chromosome Start End Peak Region
0 chr1 3 5 0 gene
1 chr1 3 5 0 tss
2 chr1 12 14 1 gene
3 chr1 200 300 2 exon
4 chr1 200 300 2 exon
5 chr1 200 300 2 gene
6 chr1 200 300 2 tes
7 chr1 200 300 2 tss
8 chr1 240 297 3 exon
9 chr1 240 297 3 gene
10 chr1 240 297 3 tes"""
    return pd.read_table(StringIO(contents), header=0, sep=r"\s+")
def expected_result():
contents = """Sample OtherGroup
0 GENE1_KO_ChIP_1 GENE2_KO
1 GENE1_KO_ChIP_1 WT
2 GENE1_KO_ChIP_2 GENE2_KO
3 GENE1_KO_ChIP_2 WT
4 GENE1_KO_ChIP_3 GENE2_KO
5 GENE1_KO_ChIP_3 WT
6 GENE2_KO_ChIP_1 GENE1_KO
7 GENE2_KO_ChIP_1 WT
8 GENE2_KO_ChIP_2 GENE1_KO
9 GENE2_KO_ChIP_2 WT
10 GENE2_KO_ChIP_3 GENE1_KO
11 GENE2_KO_ChIP_3 WT
12 WT_ChIP_1 GENE1_KO
13 WT_ChIP_1 GENE2_KO
14 WT_ChIP_2 GENE1_KO
15 WT_ChIP_2 GENE2_KO
16 WT_ChIP_3 GENE1_KO
17 WT_ChIP_3 GENE2_KO"""
    return pd.read_table(StringIO(contents), sep=r"\s+", index_col=0)
def main():
uri, outfile, dataset = get_arguments()
fd = tempfile.NamedTemporaryFile()
progress = ProgressBar(widgets=[Percentage(), ' ', Bar(), ' ', ETA(), ' ', FileTransferSpeed()])
def update(count, blockSize, totalSize):
if progress.maxval is None:
progress.maxval = totalSize
progress.start()
progress.update(min(count * blockSize, totalSize))
    urllib.request.urlretrieve(uri, fd.name, reporthook=update)  # Python 3 API
if dataset == 'zinc12':
df = pandas.read_csv(fd.name, delimiter = '\t')
df = df.rename(columns={'SMILES':'structure'})
df.to_hdf(outfile, 'table', format = 'table', data_columns = True)
    elif dataset == 'chembl22':
        df = pandas.read_table(fd.name, compression='gzip')
        df = df.rename(columns={'canonical_smiles': 'structure'})
        df.to_hdf(outfile, 'table', format='table', data_columns=True)
else:
df = pandas.read_csv(fd.name, delimiter = '\t')
df.to_hdf(outfile, 'table', format = 'table', data_columns = True)
def add_node_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
"""
inFile - pedigree as .txt file
pedgraph - Pedigree as a networkX graph object
animal - column for the animal ID
atCol - column for the attribute
atName - name for the attribute
"""
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    correct_dic_ped = {str(k): int(v) for k, v in dic_ped.items()}
    for node, value in dic_ped.items():
        # networkx >= 2.4 exposes node attributes via G.nodes;
        # use the atName parameter rather than a hard-coded key
        pedgraph.nodes[str(node)][atName] = value
    return correct_dic_ped
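A usage sketch under the assumption that node IDs in the graph are strings; the pedigree file, its columns, and the values are made up:

import networkx as nx
import pandas as pd

with open('ped_demo.txt', 'w') as fh:     # hypothetical file: animal sire dam ebv
    fh.write('1 0 0 12\n2 0 0 7\n3 1 2 9\n')

pg = nx.DiGraph()
pg.add_nodes_from(['1', '2', '3'])
ebv = add_node_attribute('ped_demo.txt', pg, animal=1, atCol=4)
print(pg.nodes['3'])    # {'attr1': 9} with the default atName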
def add_ebv_attribute(inFile, pedgraph, animal=1, atCol=4, atName="attr1"):
"""
inFile - pedigree as .txt file
pedgraph - Pedigree as a networkX graph object
animal - column for the animal ID
atCol - column for the attribute
atName - name for the attribute
"""
    ped_df = pd.read_table(inFile, header=None, delim_whitespace=True)
    dic_ped = dict(zip(ped_df[animal - 1], ped_df[atCol - 1]))
    # note the sign flip relative to add_node_attribute
    correct_dic_ped = {str(k): int(-v) for k, v in dic_ped.items()}
    for node, value in dic_ped.items():
        pedgraph.nodes[str(node)][atName] = value
    return correct_dic_ped
def _load_knownGene(filename):
""" Load UCSC knownGene table.
Parameters
----------
filename : str
path to knownGene file
Returns
-------
df : pandas.DataFrame
knownGene table if loading was successful, else None
"""
if filename is None:
return None
try:
df = pd.read_table(filename, names=['name', 'chrom', 'strand', 'txStart', 'txEnd',
'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts',
'exonEnds', 'proteinID', 'alignID'], index_col=0)
df['chrom'] = df['chrom'].str[3:]
return df
except Exception as err:
print(err)
return None
def _load_kgXref(filename):
""" Load UCSC kgXref table.
Parameters
----------
filename : str
path to kgXref file
Returns
-------
df : pandas.DataFrame
kgXref table if loading was successful, else None
"""
if filename is None:
return None
try:
df = pd.read_table(filename, names=['kgID', 'mRNA', 'spID', 'spDisplayID',
'geneSymbol', 'refseq', 'protAcc',
'description', 'rfamAcc', 'tRnaName'], index_col=0,
dtype=object)
return df
except Exception as err:
print(err)
return None
def on_clipboard(self, button):
kwargs = dict()
text = self.clipboard.wait_for_text()
lines = text[:10000].split('\n')[:-1][:10]
counts = set([x.lstrip().count('\t') for x in lines])
if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
kwargs['sep'] = '\t'
        if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
            kwargs['sep'] = r'\s+'
        try:
            self.data = pd.read_table(StringIO(text), **kwargs)
        except Exception:
            print("Unexpected error:", sys.exc_info())
else:
self.verticalbox.remove(self.scrollable_treelist)
self.add_treeview()
def read_data(filename):
""" Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
and a sparse matrix of artist/user/playcount """
# read in triples of user/artist/playcount from the input dataset
# get a model based off the input params
start = time.time()
logging.debug("reading data from %s", filename)
data = pandas.read_table(filename,
usecols=[0, 2, 3],
names=['user', 'artist', 'plays'])
# map each artist and user to a unique numeric value
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")
# create a sparse matrix of all the users/plays
plays = coo_matrix((data['plays'].astype(numpy.float32),
(data['artist'].cat.codes.copy(),
data['user'].cat.codes.copy())))
logging.debug("read data file in %s", time.time() - start)
return data, plays
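A tiny in-memory version of the sparse-matrix construction above, with three fabricated play counts standing in for the last.fm file (rows are artists, columns are users):

import numpy
import pandas
from scipy.sparse import coo_matrix

data = pandas.DataFrame({'user': ['u1', 'u2', 'u1'],
                         'artist': ['a', 'a', 'b'],
                         'plays': [3, 1, 5]})
data['user'] = data['user'].astype('category')
data['artist'] = data['artist'].astype('category')
plays = coo_matrix((data['plays'].astype(numpy.float32),
                    (data['artist'].cat.codes.copy(),
                     data['user'].cat.codes.copy())))
print(plays.toarray())   # [[3. 1.], [5. 0.]]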
def main():
# read and preprocess the movie data
movie = pd.read_table('movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python')
movie = movie_preprocessing(movie)
# read the ratings data and merge it with movie data
rating = pd.read_table("ratings.dat", sep="::",
names=["user_id", "movie_id", "rating", "timestamp"], engine='python')
data = pd.merge(rating, movie, on="movie_id")
# extract feature from our data set
streaming_batch, user_feature, actions, reward_list = feature_extraction(data)
streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
user_feature.to_csv("user_feature.csv", sep='\t')
pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False)
reward_list.to_csv("reward_list.csv", sep='\t', index=False)
action_context = movie[movie['movie_id'].isin(actions)]
action_context.to_csv("action_context.csv", sep='\t', index = False)
def get_dataframe_list(args, data_fields=('gene', 'raw_counts')):
# get a list of dataframes
dfs, files = [], args['files'] or []
# create an index using the filenames
    # this avoids an overlong command line for hundreds or thousands of files
if args['file_index']:
with open(args['file_index']) as fp:
files.extend(fp.readlines())
files = sorted(filter(None, set([f.strip() for f in files])))
    # now iterate over the files and build the list of dataframes
for f in files:
# Get only specific columns with usecols
df = pd.read_table(f, usecols=data_fields)
dfs.append(df)
return dfs, files # a list of dataframes and the files index
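A self-contained check of the usecols behaviour this relies on: only the named columns are parsed, whatever else the file contains:

from io import StringIO
import pandas as pd

tsv = "gene\traw_counts\tfpkm\nTP53\t100\t1.5\nBRCA1\t7\t0.2\n"  # made-up rows
df = pd.read_table(StringIO(tsv), usecols=('gene', 'raw_counts'))
print(df.columns.tolist())   # ['gene', 'raw_counts']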
def _load_table(self, filepath):
"""
Load table from file system.
:param str filepath: Path to table in CSV, TSV, XLSX or
Pandas pickle format.
:return: Pandas table
:rtype: pandas.core.frame.DataFrame
"""
_, ext = os.path.splitext(filepath.lower())
if ext == '.tsv':
return pd.read_table(filepath, **self.kwargs)
if ext == '.csv':
return pd.read_csv(filepath, **self.kwargs)
if ext == '.xlsx':
return pd.read_excel(filepath, **self.kwargs)
return pd.read_pickle(filepath, **self.kwargs)
def years(self):
    df_list = []
    keys = [str(i) for i in range(1, 13)]
    months = list(range(1, 13))
    result = []
    for i in range(1, 13):
        filename = '2016-%s.xls' % str(i).zfill(2)
        # the original Chinese column names were lost to mis-encoding
        # ('????') and cannot be recovered here
        t = pd.read_table(filename, encoding='gbk', dtype={u'????': str})
        fee = t[u'???'].sum() + t[u'???'].sum() + t[u'????'].sum()
        print(i, 'fee:', fee)
        df_list.append(t)
        result.append(fee)
    df = pd.concat(df_list, keys=keys)
    #df.to_excel('2016_delivery_order.xls')
    self.caculation(df)
    plt.plot(months, result)
    plt.show()
def __loadPar( self, parname ):
"""
Frealign files normally have 16 columns, with any number of comment lines that start with 'C'
"""
# Ergh, cannot have trailing comments with np.loadtxt?
self.parCol = [b"N", b"PSI", b"THETA", b"PHI", b"SHX", b"SHY", b"MAG", b"FILM", b"DF1", b"DF2", \
b"ANGAST", b"OCC", b"LogP", b"SIGMA", b"SCORE", b"CHANGE" ]
self.par = pandas.read_table( parname, engine='c', sep=' ', header=None, names =self.parCol, quotechar='C' )
#self.par.append( np.loadtxt( parname, comments=b'C' ) )
# TODO: split into a dictionary?
# TODO: read comments as well
#self.parCol = {b"N":0, b"PSI":1, b"THETA":2, b"PHI":3, b"SHX":4, b"SHY":5, b"MAG":6, b"FILM":7, b"DF1":8, b"DF2":9,
# b"ANGAST":10, b"OCC":11, b"LogP":12, b"SIGMA":13, b"SCORE":14, b"CHANGE":15 }
#self.parComments = np.loadtxt( parname, comments=b' ' )
def read_cufflinks(sample_path, isoforms=False):
''' Function for reading a Cufflinks quantification result.
Returns
-------
A pandas.Series with the expression values in the sample.
'''
if isoforms:
quant_file = sample_path + '/isoforms.fpkm_tracking'
else:
quant_file = sample_path + '/genes.fpkm_tracking'
df = pd.read_table(quant_file, engine='c',
usecols=['tracking_id', 'FPKM'],
index_col=0,
                       dtype={'tracking_id': str, 'FPKM': np.float64})
df['tracking_id'] = df.index
df = df.groupby('tracking_id').sum()
df['TPM'] = df['FPKM'] / df['FPKM'].sum() * 1e6
df = df.rename(columns={'tracking_id': 'target_id'})
return df['TPM']
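The TPM column above is just the FPKM values renormalised to sum to one million; a worked check with made-up numbers:

import pandas as pd

fpkm = pd.Series({'geneA': 10.0, 'geneB': 30.0})
tpm = fpkm / fpkm.sum() * 1e6
print(tpm)   # geneA 250000.0, geneB 750000.0 -- TPM always sums to 1e6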
def getFeaturesForGenome(genomeId, CDS_ONLY):
"""
    This method gets the features for a particular genomeId from PATRIC
Parameters
genomeId: UniqueId for the genome
CDS_ONLY: retrieve only CDS features
"""
    data_table = pd.read_table(PatricURL
                               + genomeId + '/' + genomeId + '.PATRIC.features.tab')
    print(data_table.shape)
if CDS_ONLY:
return data_table[(data_table.feature_type == 'CDS')]
else:
return data_table
def input_data():
contents = u"""Chromosome Bin End examples/test.bed
chr1 887600 887799 0
chr1 994600 994799 0
chr1 1041000 1041199 0
chr1 1325200 1325399 1
chr1 1541600 1541799 1
chr1 1599000 1599199 1
chr1 1770200 1770399 0
chr1 1820200 1820399 1
chr1 1995000 1995199 0
chr1 2063800 2063999 0
chr1 2129400 2129599 0
chr1 2239000 2239199 0
chr1 2318800 2318999 0
chr1 2448200 2448399 1
chr1 3006000 3006199 0
chr1 3046000 3046199 1
chr1 3089200 3089399 0
chr1 3093800 3093999 0
chr1 3096400 3096599 0"""
return pd.read_table(StringIO(contents), sep="\s+", index_col=[0, 1, 2])
def expected_result():
c = u"""Bin Chromosome ooo
887600 chr1 1
994600 chr1 1
1041000 chr1 1
1770200 chr1 1
1770400 chr1 1
1995000 chr1 1
2063800 chr1 1
2064000 chr1 1
2129200 chr1 1
2239000 chr1 1
2318800 chr1 1
3006000 chr1 1"""
    return pd.read_table(StringIO(c), sep=r"\s+", index_col=[1, 0])
def expected_result(input_bed_file):
df = pd.read_table(
StringIO(u"""Count Chromosome Bin
2 chr1 39036800
1 chr1 73781000
1 chr1 90059800
1 chr3 55648200
1 chr7 20246600
1 chr7 91135000
1 chr13 100938400
1 chr19 43528800
1 chr19 47108800"""),
sep=r"\s+",
dtype={"Count": int32,
"Bin": int32})
df.columns = [input_bed_file, "Chromosome", "Bin"]
return df
def read_dfs(files):
    full_path = False
    if len(files) != len(set(basename(f) for f in files)):
        logging.info("Matrix files do not have unique basenames; using full paths in header!")
        full_path = True
dfs = OrderedDict()
for f in files:
df = pd.read_table(f, header=0, sep=" ", index_col=[0, 1])
df = df[~df.index.duplicated(keep='first')]
columns = list(df.columns)
file_nick = "Enriched_" + basename(f) if not full_path else "Enriched_" + f
columns[0] = file_nick
df.columns = columns
logging.info("Calling " + f + " " + file_nick + " in matrix file.")
dfs[f] = df
return dfs
def main(self, name, opts):
logging.basicConfig(filename=opts.log_file,
format='%(levelname)s (%(asctime)s): %(message)s')
log = logging.getLogger(name)
if opts.verbose:
log.setLevel(logging.DEBUG)
else:
log.setLevel(logging.INFO)
lc = []
for split, filename in zip(['train', 'val'],
[opts.train_lc, opts.val_lc]):
_lc = pd.read_table(filename)
_lc['split'] = split
_lc['epoch'] = range(1, len(_lc) + 1)
lc.append(_lc)
lc = pd.concat(lc)
plot = plot_lc(lc, metrics=opts.metrics, outputs=opts.outputs)
plot.savefig(opts.out_file)
log.info('Done!')
return 0
def create_routing_table(bgp=None, ixp_prefixes=None, ixp_asns=None, bgp_compression='infer'):
log.info('Creating IP2AS tool.')
if bgp_compression == 'infer' and bgp.startswith('http'):
bgp_compression = infer_compression(bgp, 'infer')
if not isinstance(ixp_prefixes, pd.DataFrame):
ixp_prefixes = set(pd.read_csv(ixp_prefixes, comment='#', index_col=0).index.unique()) if ixp_prefixes is not None else set()
if not isinstance(ixp_asns, pd.DataFrame):
ixp_asns = set(pd.read_csv(ixp_asns, comment='#', index_col=0).index.unique()) if ixp_asns is not None else set()
if not isinstance(bgp, pd.DataFrame):
bgp_original = pd.read_table(bgp, comment='#', names=['Address', 'Prefixlen', 'ASN'], compression=bgp_compression)
bgp = bgp_original[~bgp_original.ASN.str.contains(',|_')].copy()
bgp['ASN'] = pd.to_numeric(bgp.ASN)
rt = RoutingTable()
for address, prefixlen, asn in bgp[~bgp.ASN.isin(ixp_asns)].itertuples(index=False):
rt.add_prefix(asn.item(), address, prefixlen)
for address, prefixlen, asn in bgp[bgp.ASN.isin(ixp_asns)].itertuples(index=False):
rt.add_ixp(address, prefixlen)
for prefix in ixp_prefixes:
rt.add_ixp(prefix)
rt.add_private()
rt.add_multicast()
rt.add_default()
return rt
def _mag_hires_helper(year, doy, local_dir, url, coords):
fname = str(year)[2:] + doy + '_FGM_' + coords
hdf_fname = '{}_{}.hdf'.format(year, doy)
hdfloc = os.path.join(local_dir, hdf_fname)
if os.path.isfile(hdfloc):
return pd.read_hdf(hdfloc)
f = helper.load(fname + '.TAB', local_dir, url)
if 'error_message' in f.readline():
f.close()
os.remove(os.path.join(local_dir, fname + '.TAB'))
        raise RuntimeError(
            'No file named {} exists on remote server'.format(fname))
df = pd.read_table(f, names=['Time', 'Bx', 'By', 'Bz'],
delim_whitespace=True,
parse_dates=[0], index_col=0)
if use_hdf:
df.to_hdf(hdfloc, key='data', mode='w')
return df
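A generic sketch of the parse-once, cache-to-HDF pattern used above (the function name and file paths are hypothetical; to_hdf requires the optional tables package):

import os
import pandas as pd

def cached_read(tab_path, hdf_path):
    # return the cached HDF copy if it exists, otherwise parse and cache
    if os.path.isfile(hdf_path):
        return pd.read_hdf(hdf_path)
    df = pd.read_table(tab_path, delim_whitespace=True,
                       names=['Time', 'Bx', 'By', 'Bz'],
                       parse_dates=[0], index_col=0)
    df.to_hdf(hdf_path, key='data', mode='w')
    return df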
def to_dataframe(lines, **kwargs):
names = lines.readline().decode('utf-8').strip().split('\t')
types = lines.readline().decode('utf-8').strip().split('\t')
dtypes, parse_dates, converters = {}, [], {}
for name, chtype in zip(names, types):
dtype = CH2PD[chtype]
if dtype == 'object':
converters[name] = decode_escapes
elif dtype.startswith('datetime'):
parse_dates.append(name)
else:
dtypes[name] = dtype
return pd.read_table(lines, header=None, names=names, dtype=dtypes,
parse_dates=parse_dates, converters=converters,
na_values=set(), keep_default_na=False, **kwargs)
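CH2PD and decode_escapes are assumed module-level helpers here; CH2PD maps ClickHouse type names to pandas dtype strings. A plausible, deliberately incomplete sketch of such a mapping:

# assumption: a mapping from ClickHouse types to pandas dtype strings
CH2PD = {
    'String': 'object',
    'Int32': 'int32',
    'Int64': 'int64',
    'UInt32': 'uint32',
    'Float64': 'float64',
    'Date': 'datetime64[ns]',
    'DateTime': 'datetime64[ns]',
}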