def calc_word_sim(model, eval_file):
df = pd.read_csv(eval_file, sep=',', header=0) # eval dataset
col1, col2, score = df.columns.values
model_vocab = model.vocab.keys()
ground = []
sys = []
for idx, row in df.iterrows():
if row[col1] in model_vocab and row[col2] in model_vocab:
ground.append(float(row[score]))
sys.append(model.similarity(row[col1], row[col2]))
# compute Spearman's rank correlation coefficient (https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
print(sys)
# import pdb;pdb.set_trace()
corr, p_val = stats.spearmanr(sys, ground)
logger.info("# of pairs found: %s / %s" % (len(ground), len(df)))
logger.info("correlation: %s" % corr)
return corr, p_val
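A minimal usage sketch for calc_word_sim, assuming a pre-4.0 gensim word2vec model (which still exposes model.vocab and model.similarity) and a comma-separated evaluation file such as WordSim-353 with a word1,word2,score header; the file names below are hypothetical.

# Hypothetical file names; assumes gensim < 4.0, whose KeyedVectors still
# expose .vocab and .similarity as used by calc_word_sim above.
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
corr, p_val = calc_word_sim(model, 'wordsim353.csv')
print('spearman r = %.3f (p = %.3g)' % (corr, p_val))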
def draw(path, srv):
filename = os.path.join(path, srv["preprocessed_filename"])
df = pd.read_csv(filename, sep="\t", index_col='time', parse_dates=True)
bins = defaultdict(list)
for i, col in enumerate(df.columns):
serie = df[col].dropna()
if pd.algos.is_monotonic_float64(serie.values, False)[0]:
serie = serie.diff()[1:]
p_value = adfuller(serie, autolag='AIC')[1]
if math.isnan(p_value): continue
nearest = 0.05 * round(p_value/0.05)
bins[nearest].append(serie)
for bin, members in bins.items():
series = [serie.name for serie in members]
if len(members) <= 10:
columns = series
else:
columns = random.sample(series, 10)
subset = df[columns]
name = "%s_adf_confidence_%.2f.png" % (srv["name"], bin)
print(name)
axes = subset.plot(subplots=True)
plt.savefig(os.path.join(path, name))
plt.close("all")
def test_addepar2frame(self):
r = {'meta': {'columns': [{'key': 'node_id', 'display_name': 'Entity ID', 'output_type': 'Word'},
{'key': '_custom_13_custodian_name_166730', 'display_name': '15. Custodian Name', 'output_type': 'Word'},
{'key': '_custom_15_reference_currency_165485', 'display_name': '17. Reference Currency', 'output_type': 'Currency'},
{'key': '_custom_16_lwm_risk_profile_114480', 'display_name': '18. LWM Risk Profile', 'output_type': 'Word'},
{'key': '_custom_23_lwm_aum_type_293536', 'display_name': '23. LWM - AUM Type', 'output_type': 'Word'},
{'key': 'inception_event_date', 'display_name': 'Inception Date', 'output_type': 'Date'}],
'groupings': [{'key': 'top_level_owner', 'display_name': 'Top Level Owner'}]},
'data': {'type': 'portfolio_views', 'attributes':
{'total': {'name': 'Total', 'columns':
{'_custom_15_reference_currency_165485': None, 'inception_event_date': '2013-12-31', '_custom_23_lwm_aum_type_293536': None, '_custom_16_lwm_risk_profile_114480': None, '_custom_13_custodian_name_166730': None, 'node_id': None},
'children': [{'entity_id': 1146188, 'name': 'A', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2016-10-31', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Balanced', '_custom_13_custodian_name_166730': 'X', 'node_id': 1146188}, 'children': []},
{'entity_id': 1231399, 'name': 'B', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2016-09-21', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Balanced', '_custom_13_custodian_name_166730': 'Y', 'node_id': 1231399}, 'children': []},
{'entity_id': 1511499, 'name': 'C', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2017-03-31', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Conservative', '_custom_13_custodian_name_166730': 'Z', 'node_id': 1511499}, 'children': []},
]}}, 'links': {'self': '/v1/portfolio_views/null'}}}
pdt.assert_frame_equal(addepar2frame(r), pd.read_csv("/pyaddepar/test/resources/frame.csv", parse_dates=True), check_dtype=False)
def generate_vocabulary(self, review_summary_file):
"""
:param review_summary_file: csv file of (review, summary) pairs
:return: None; populates the word/index maps in place
"""
self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values
for review,summary in self.rev_sum_pair:
rev_lst = wordpunct_tokenize(review)
sum_lst = wordpunct_tokenize(summary)
self.__add_list_to_dict(rev_lst)
self.__add_list_to_dict(sum_lst)
# Now store the "" empty string as the last word of the vocabulary
self.map[""] = len(self.map)
self.revmap[len(self.map)] = ""
def plot_csv(stock_data, symbol):
"""
params:
- stock_data(list) : list of dict objects containing stock data
- symbol(str) : stock symbol used to name the CSV and HTML output files.
"""
try:
df = pd.read_csv('{}.csv'.format(symbol))
except (FileNotFoundError, pd.errors.EmptyDataError):
write_to_csv(stock_data, symbol)
df = pd.read_csv('{}.csv'.format(symbol))
p1 = figure(x_axis_type="datetime", title="Stock Closing Price")
p1.grid.grid_line_alpha = 0.3
p1.xaxis.axis_label = 'Date'
p1.yaxis.axis_label = 'Price'
p1.line(pd.to_datetime(df['date']), list(df['close']),
color='#A6CEE3', legend=symbol)
output_file("{}.html".format(symbol), title="Stock Closing Prices")
show(p1) # open a browser
def fix_columns(df):
"""
Changes DataFrame in-place
"""
# Convert all string columns to str to avoid a PerformanceWarning
for col in _STRING_COLUMNS:
if col not in df:
continue
df[col].fillna('', inplace=True)
df[col] = df[col].astype('str')
# Empty strings have been set to NaN by read_csv. Replacing
# by the empty string avoids problems with groupby, which
# ignores NaN values.
# Columns that have any NaN values in them cannot be converted to
# int due to a numpy limitation.
for col in _INTEGER_COLUMNS:
if col not in df.columns:
continue
if all(df[col].notnull()):
df[col] = df[col].astype(int)
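A toy illustration of the NaN-to-int limitation mentioned in the comments above (standalone example, not part of the original module):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])
# s.astype(int) would raise ValueError: non-finite values (NaN) cannot be cast to integer
s = s.fillna(0)
print(s.astype(int).dtype)  # int64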
def main(args):
if args.minimum_frequency is None:
minimum_frequency = max((len(args.tables) + 1) // 2, 2)
else:
minimum_frequency = args.minimum_frequency
logger.info('Minimum frequency set to %s', minimum_frequency)
# Read in tables
tables = []
for path in args.tables:
table = pd.read_csv(path, sep='\t')
table = table[table.database_diff >= args.minimum_db_diff]
table = table.dropna()
tables.append(table)
if len(table) == 0:
logger.warning('Table read from %r is empty after keeping only sequences with database diff >= %s.', path, args.minimum_db_diff)
# Count V sequence occurrences
counter = Counter()
for table in tables:
counter.update(set(table.consensus))
# Find most frequent occurrences and print result
print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
for sequence, frequency in counter.most_common():
if frequency < minimum_frequency:
break
names = []
gene = None
for table in tables:
matching_rows = table[table.consensus == sequence]
if matching_rows.empty:
continue
names.extend(matching_rows.name)
if gene is None:
row = matching_rows.iloc[0]
gene = row.gene
database_diff = row.database_diff
#shm = row['V_SHM']
print(frequency, gene, database_diff, sequence, *names, sep='\t')
def count_full_text_occurrences(candidates, table_path, other_gene, other_errors, merge, min_count):
# Use only records that have a chance of reaching the required min_count
records = {info.sequence: info for info in candidates if info.max_count >= min_count}
# Count full-text occurrences in the genomic_sequence, circumventing
# inaccurate IgBLAST alignment boundaries
# TODO limit the search to the gene region (especially for D genes)
# Speed up search by looking for most common sequences first
search_order = sorted(records, key=lambda s: records[s].max_count, reverse=True)
cols = [other_gene, 'V_errors', 'J_errors', 'CDR3_nt', 'genomic_sequence']
for chunk in pd.read_csv(table_path, usecols=cols, chunksize=10000, sep='\t'):
chunk = chunk[chunk[other_errors] == 0]
for row in chunk.itertuples():
for needle in search_order:
if needle in row.genomic_sequence:
record = records[needle]
record.count += 1
record.other_genes.add(getattr(row, other_gene))
record.cdr3s.add(row.CDR3_nt)
if merge:
break
return records.values()
def main(args):
n = 0
first = True
written = 0
stats = FilteringStatistics()
for chunk in pd.read_csv(args.table, chunksize=10000, sep='\t'):
fix_columns(chunk)
n += len(chunk)
filtered, chunk_stats = filtered_table(chunk, v_gene_coverage=args.v_coverage,
j_gene_coverage=args.j_coverage, v_gene_evalue=args.v_evalue)
stats += chunk_stats
print(filtered.to_csv(sep='\t', index=False, header=first), end='')
first = False
written += len(filtered)
logger.info('%s rows in input table', stats.n)
logger.info('%s rows have both V and J assignment', stats.vjassigned)
logger.info('%s of those do not have a stop codon', stats.stop)
logger.info('%s of those have an E-value of at most %s', stats.v_evalue, args.v_evalue)
logger.info('%s of those cover the V gene by at least %s%%', stats.v_coverage, args.v_coverage)
logger.info('%s of those cover the J gene by at least %s%%', stats.j_coverage, args.j_coverage)
logger.info('%d rows written', written)
def get_treasury_data(start_date, end_date):
return pd.read_csv(
"http://www.federalreserve.gov/datadownload/Output.aspx"
"?rel=H15"
"&series=bf17364827e38702b42a58cf8eaa3f78"
"&lastObs="
"&from=" # An unbounded query is ~2x faster than specifying dates.
"&to="
"&filetype=csv"
"&label=omit"
"&layout=seriescolumn"
"&type=package",
skiprows=1, # First row is a useless header.
parse_dates=['Time Period'],
na_values=['ND'], # Presumably this stands for "No Data".
index_col=0,
).loc[
start_date:end_date
].dropna(
how='all'
).rename(
columns=parse_treasury_csv_column
).tz_localize('UTC') * 0.01 # Convert from 2.57% to 0.0257.
def storageindex(self):
#get the filelist
onlyfiles = [ f for f in listdir(self.indexdata) if isfile(join(self.indexdata,f)) ]
#read from using pandas
for f in onlyfiles:
df = pd.read_csv(self.indexdata+"/"+f)
s=f.split('.')
name = s[0][2:8]
records = json.loads(df.T.to_json()).values()
for row in records:
row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d")
print(name)
self.index[name].insert_many(records)
# store the stock pool in the database
def load_names_data():
fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
if not os.path.exists(fp):
r = requests.get(URL_NAMES)
with open(fp, 'wb') as f:
f.write(r.content)
post = collections.OrderedDict()
with zipfile.ZipFile(fp) as zf:
# get ZipInfo instances
for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
fn = zi.filename
if fn.startswith('yob'):
year = int(fn[3:7])
df = pd.read_csv(
zf.open(zi),
header=None,
names=('name', 'gender', 'count'))
df['year'] = year
post[year] = df
df = pd.concat(post.values())
df.set_index('name', inplace=True, drop=True)
return df
def read_data(fname):
""" Read football-data.co.uk csv """
data = (
pd.read_csv(fname)
.rename(columns={
'HomeTeam': 'home_team',
'AwayTeam': 'away_team',
'FTHG': 'home_goals',
'FTAG': 'away_goals'
})
.loc[lambda df: ~pd.isnull(df['home_goals'])] # Remove future games
)
team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
data['home_team_id'] = data['home_team'].replace(team_map)
data['away_team_id'] = data['away_team'].replace(team_map)
for col in ('home_goals', 'away_goals'):
data[col] = [int(c) for c in data[col]]
return data, team_map
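A hypothetical usage sketch for read_data; the file name assumes a downloaded football-data.co.uk season file (e.g. E0.csv for the English Premier League), and stan_map is defined elsewhere in the surrounding module.

# Hypothetical usage; E0.csv is a downloaded football-data.co.uk season file.
data, team_map = read_data('E0.csv')
print(data[['home_team', 'away_team', 'home_goals', 'away_goals']].head())
print(len(team_map), 'teams mapped to integer ids')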
def cluster_map_sheet_pre():
print("------ load cluster_map data ----------")
cluster_map_sheet_path = os.path.join(LOAD_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
print("load data from: ", cluster_map_sheet_path)
save_path = os.path.join(SAVE_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
print("save data to: ", save_path)
file = "cluster_map"
cluster_sheet = os.path.join(cluster_map_sheet_path, file)
data = pd.read_csv(cluster_sheet, header=None)
data.columns = ["raw"]
data["district_hash"] = data["raw"].map(lambda x: x.split("\t")[0])
data["district_map"] = data['raw'].map(lambda x: x.split("\t")[1])
del data["raw"]
save_df_to_file(data, save_path, file)
# handle the order_info sheet
def create_hash_district_map_dict():
file = "cluster_map.csv"
district_hash_map_path = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, file)
hash_data = pd.read_csv(district_hash_map_path)
## convert the dataframe into dict
hash_map_rule = dict(zip(hash_data.district_hash, hash_data.district_map))
# print(type(hash_map_rule))
saved_file = "cluster_map.pickle"
map_save_file = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, saved_file)
## save into same dir as file
with open(map_save_file, "wb") as f:
pickle.dump(hash_map_rule, f)
#print(hash_map_rule)
# map the district features in the input data_frame into value
def test_prepare_dataset(self, fetch, chamber_of_deputies):
"""
* Rename columns.
* Make `document_type` a category column.
* Rename values for `category`.
* Create `is_party_expense` column.
"""
dataset = self.subject.dataset
self.assertTrue(set(ADAPTER_COLUMNS.keys()).issubset(set(dataset.columns)))
document_types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad']
self.assertEqual(document_types,
dataset['document_type'].cat.categories.tolist())
fixture = pd.read_csv(os.path.join(self.fixtures_path, 'reimbursements.xz'))
meal_rows = fixture \
.query('subquota_description == "Congressperson meal"').index
self.assertEqual(['Meal'],
dataset.loc[meal_rows, 'category'].unique().tolist())
party_expense_rows = fixture[fixture['congressperson_id'].isnull()].index
self.assertEqual([True],
dataset.loc[party_expense_rows, 'is_party_expense'].unique().tolist())
def _load_sets(self):
print("Loading datasets")
train_patients = pd.read_csv("data/stage1/" + "stage1_labels.csv")
for idx, row in train_patients.iterrows():
if self._check_sample_exists(row['id']):
self._test_set.append(row['id'])
for idx, row in train_patients.iterrows():
if self._check_sample_exists(row['id']):
self._train_set.append([row['id'], row['cancer']])
#Create permutation for random loading
self.shuffle()
print("Loading datasets: Done!")
def _load_sets(self):
print("Loading datasets")
train_patients = pd.read_csv(os.path.join(self._directory, "stage1_labels.csv"))
test_patients = pd.read_csv(os.path.join(self._directory, "stage1_sample_submission.csv"))
for idx, row in test_patients.iterrows():
self._test_set.append(row['id'])
for idx, row in train_patients.iterrows():
self._train_set.append([row['id'], row['cancer']])
#Create permutation for random loading
self.shuffle()
print("Loading datasets: Done!")
def doctable(ctx):
df = pd.read_csv('./docs/flight-options.csv')
# open an existing document
doc = docx.Document('./docs/style-reference.docx')
as_int = partial(format_decimal, format='#')
as_usd = partial(format_currency, currency='USD')
s = doc.sections[0]
width = s.page_width - s.left_margin - s.right_margin
doc.add_picture('./docs/diagrams_002.png', width=width)
formatters = {
'ticket_price': as_usd,
'total_hours': as_int,
'trip': as_int,
'airline': partial(shorten_long_name, width=20),
'selected': compose({0: 'No', 1: 'Yes'}.get, int)
}
add_table(df, doc, table_style='Plain Table 3', formatters=formatters)
# save the doc
doc.save('./docs/test.docx')
def eval(flags):
name = flags.pred_path
yp = pd.read_csv(name)
classes = len([i for i in yp.columns.values if 'class' in i])
yp = yp[['class%d'%i for i in range(1,classes+1)]].values
myDB = personalDB(flags,name="full")
if "stage1" in name:
y=myDB.data['test_variants_filter']['Class']-1
else:
myDB.get_split()
va = myDB.split[flags.fold][1]
y = np.argmax(myDB.y[va],axis=1)
if np.max(y)>classes:
y = np.argmax(to4c(onehot_encode(y)),axis=1)
score = cross_entropy(y,yp)
print(name,score,'\n')
def eval(name,clip=False,bar=0.9):
base = pd.read_csv('../input/stage1_solution_filtered.csv')
base['Class'] = np.argmax(base[['class%d'%i for i in range(1,10)]].values,axis=1)
sub = pd.read_csv(name)
#sub = pd.merge(sub,base[['ID','Class']],on="ID",how='right')
#print(sub.head())
y = base['Class'].values
yp = sub[['class%d'%i for i in range(1,10)]].values
if clip:
yp = np.clip(yp,(1.0-bar)/8,bar)
yp = yp/np.sum(yp,axis=1).reshape([yp.shape[0],1])
print(name,cross_entropy(y,yp),multiclass_log_loss(y,yp))
for i in range(9):
y1 = y[y==i]
yp1 = yp[y==i]
print(i,y1.shape,cross_entropy(y1,yp1),multiclass_log_loss(y1,yp1))
def post(self):
if self.flags.task == "test_cnn_stage1":
docs = self.DB.clean_doc['test_text_filter']
elif self.flags.task == "test_cnn_stage2":
docs = self.DB.clean_doc['stage2_test_text']
else:
self.mDB.get_split()
docs = self.mDB.split[self.flags.fold][1]
nrows = len(docs)
p = np.zeros([nrows,9])
for i in range(self.flags.epochs):
if i==0:
skiprows=None
else:
skiprows = nrows*i
p = p + (pd.read_csv(self.flags.pred_path,header=None,nrows=nrows,skiprows=skiprows).values)
p = p/self.flags.epochs
if '_cv' in self.flags.task:
from utils.np_utils.utils import cross_entropy
y = np.argmax(self.mDB.y,axis=1)
print("cross entropy", cross_entropy(y[self.mDB.split[self.flags.fold][1]],p))
s = pd.DataFrame(p,columns=['class%d'%i for i in range(1,10)])
s['ID'] = np.arange(nrows)+1
s.to_csv(self.flags.pred_path.replace(".csv","_sub.csv"),index=False,float_format="%.5f")
def post_cv(flags):
import re
import os
path = flags.data_path
files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv',i))]
s = []
for name in files:
s.append(pd.read_csv("%s/%s"%(path,name)))
s = pd.concat(s,axis=0)
print(s.head())
classes = len([i for i in s.columns.values if 'class' in i])
from utils.np_utils.utils import cross_entropy
yp = s[['class%d'%i for i in range(1,classes+1)]].values
y=s['real'].values
print(cross_entropy(y,yp))
s.to_csv("%s/cv.csv"%path,index=False)
def replace(s,n):
seen = pd.read_csv(s)
unseen = pd.read_csv(n)
te = pd.read_csv('../input/stage2_test_variants.csv')
tr = pd.read_csv('../input/training_variants')
unseen = pd.merge(unseen,te,on='ID',how='right')
seen = pd.merge(seen,te,on='ID',how='right')
mask = seen.Gene.isin(tr.Gene)
cols = ['class%d'%i for i in range(1,10)]
seen.loc[~mask,cols] = 0
mask = unseen.Gene.isin(tr.Gene)
unseen.loc[mask,cols] = 0
assert (unseen['ID']==seen['ID']).all()
seen[cols] = seen[cols] + unseen[cols]
seen[cols+['ID']].to_csv('mix.csv',index=False)
def test2():
s1 = pd.read_csv('../input/test_variants')
s3 = pd.read_csv('../input/test_variants_filter')
s1 = pd.merge(s1,s3[['ID','Class']],on='ID',how='left').fillna(1)
s2 = pd.read_csv('../input/stage2_test_variants.csv')
s1 = pd.merge(s1,s2,on= ["Gene", "Variation"],how='inner')
s1['ID'] = s1['ID_y']
s2 = pd.merge(s1[['ID','Class']],s2,on='ID',how='right').fillna(1)
yp = onehot_encode(s2['Class'].values-1)
for i in range(1,10):
s2['class%d'%i] = yp[:,i-1]
cols = ['class%d'%i for i in range(1,10)]
mask = s2['ID'].isin(s1['ID_y'])
s2.loc[~mask,cols] = 0.1
s2['ID'] = s2['ID'].astype(int)
cols = ['ID']+['class%d'%i for i in range(1,10)]
s2[cols].to_csv('sub.csv',index=False)
def x_label(feature_path, pred=False):
X_list = []
for each in feature_path:
X = pd.read_csv(feature_paths.format(str(each)))
X_list.append(X)
X = pd.DataFrame(pd.concat(X_list, axis=0)).reset_index().drop('index', axis=1)
if not pred:
y = X[power_consumption].tolist()
X = X.drop([record_date, user_id, power_consumption], axis=1)
columns = X.columns
X = X.values
return X, y, columns
else:
X = X.drop([record_date, user_id], axis=1)
columns = X.columns
X = X.values
return X, columns
def neighbors():
"""
Read the neighbors for each country.
"""
neighbors_csv = pd.read_csv(csv_path("mledoze-countries.csv"), sep=';',
usecols=[4, 17])
neighbors_csv.columns = ["Code", "neighbors"]
neighbors_csv["neighbors"] = neighbors_csv["neighbors"].str.split(',')
for row in neighbors_csv.loc[neighbors_csv.neighbors.isnull(), 'neighbors'].index:
neighbors_csv.at[row, 'neighbors'] = []
# Island nations are a weird exception
neighbors_csv.loc[neighbors_csv.Code == "MDG", "neighbors"] = [["MOZ", "ZAF", "TZA"]]
neighbors_csv.loc[neighbors_csv.Code == "TWN", "neighbors"] = [["CHN", "PHL"]]
neighbors_csv.loc[neighbors_csv.Code == "AUS", "neighbors"] = [["NZL"]]
neighbors_csv.loc[neighbors_csv.Code == "NZL", "neighbors"] = [["AUS"]]
neighbors_csv.loc[neighbors_csv.Code == "JPN", "neighbors"] = [["TWN", "KOR", "PHL"]]
neighbors_csv.loc[neighbors_csv.Code == "PHL", "neighbors"] = [["TWN", "KOR", "JPN"]]
neighbors_csv.loc[neighbors_csv.Code == "PRI", "neighbors"] = [["DOM"]]
neighbors_csv.loc[neighbors_csv.Code == "SGP", "neighbors"] = [["MYS", "IDN"]]
neighbors_csv.loc[neighbors_csv.Code == "JAM", "neighbors"] = [["CUB", "DOM"]]
return neighbors_csv
def loadFile(fileName):
# call checkFileName() to resolve the csv file name to load
outputFileName = checkFileName(fileName)
if outputFileName != -1:
df = pandas.read_csv(outputFileName)
content = df["Content"]
title = df["Title"]
company = df["Company"]
print(company)
print("csv FIle Load Success")
else:
print("Error csv File")
# checkFileName ??
# ???? ??? ???? ???? ??? -1 ??, ??? ??? ??
# ??? ???? all?? ?? ??? ?? csv??? ??? ???, csv??? ?? ??
# ??? ???? csv ??? ??
def loadFile(fileName,analyzeValue):
# call checkFileName() to resolve the csv file name to load
outputFileName = checkFileName(fileName)
if outputFileName != -1:
df = pandas.read_csv(outputFileName)
content = df["Content"]
title = df["Title"]
company = df["Company"]
print("csv FIle Load Success")
if analyzeValue==1:
# analyze(title)
analyze(content)
else:
print("Error csv File")
# checkFileName ??
# ???? ??? ???? ???? ??? -1 ??, ??? ??? ??
# ??? ???? all?? ?? ??? ?? csv??? ??? ???, csv??? ?? ??
# ??? ???? csv ??? ??