def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
python类detect()的实例源码
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def write2file(item_parts):
    """Write each group of items to its own ``<label>.txt`` file.

    Args:
        item_parts: iterable of pairs whose first element is an iterable of
            items and whose second element is the label used as the output
            file's base name. Each item must expose ``filename`` (a byte
            string, decoded here as GB18030) and ``text``.

    Each item becomes one ``"<filename>\\t<text>\\n"`` row, and rows are
    joined with an extra ``"\\n"``, so consecutive rows are separated by a
    blank line in the output.
    """
    for items in item_parts:
        group, label = items[0], items[1]
        # Build the rows before touching the file system, so a decoding or
        # formatting error does not leave a freshly truncated file behind.
        rows = [
            '%s\t%s\n' % (item.filename.decode('GB18030'), item.text)
            for item in group
        ]
        # The context manager closes the handle even if the write fails.
        with open(u'%s.txt' % (label), 'w') as out:
            out.write('\n'.join(rows))
# ???????2?
def cluster_process(filenames, key_part, s, n_clusters=2):
    """Cluster the documents extracted from ``filenames`` and write out each cluster.

    Args:
        filenames: indexable sequence of input file names; one document is
            extracted per entry via ``extract(fname, key_part=key_part)``.
        key_part: sequence of strings, passed to ``extract`` and joined with
            ``'_'`` to form part of each cluster's label.
        s: prefix used in each cluster's label.
        n_clusters: number of clusters requested from ``clustering``.

    Returns:
        A list with one ``(cluster_filenames, label)`` pair per cluster index,
        where ``label`` is ``u'<s>_<joined key_part>_<index>'``.

    Side effects:
        Prints the key, each cluster's size and the clustering score, and
        passes the per-cluster documents to ``write2file``.
    """
    # One extracted document per input file, in the same order as ``filenames``.
    documents = [extract(fname, key_part=key_part) for fname in filenames]

    # NOTE(review): the original comment says get_tfidf uses gensim's tf-idf,
    # and that labels look like [0, 1, 0, 1, 1, ...] (one cluster index per
    # document) -- confirm against those helpers.
    docs = get_tfidf(documents)
    labels, score = clustering(docs, n_clusters)

    joined_key = '_'.join(key_part)
    # Only byte strings need decoding for display; text passes through as-is.
    if isinstance(joined_key, bytes):
        shown_key = joined_key.decode('utf-8')
    else:
        shown_key = joined_key
    # Single-argument parenthesised prints give the same output on
    # Python 2 and also parse on Python 3.
    print('key_part: %s' % shown_key)

    item_parts = []
    filename_parts = []
    for i in range(n_clusters):
        # Positions of the documents assigned to cluster ``i``.
        members = [j for j, label in enumerate(labels) if label == i]
        tag = u'%s_%s_%d' % (s, joined_key, i)
        filename_parts.append(([filenames[j] for j in members], tag))
        item_parts.append(([documents[j] for j in members], tag))
        print('class_%d:%d' % (i, len(members)))

    print('score: %s' % (score,))
    print('-' * 20)
    write2file(item_parts)
    return filename_parts
def get_text(f_path, filename):
    """Read a file and return its lines with surrounding whitespace removed.

    Args:
        f_path: directory containing the file.
        filename: name of the file inside ``f_path``.

    Returns:
        A list with one entry per line of the file, stripped of leading and
        trailing whitespace. Lines that are empty after stripping are
        replaced by the literal string ``'None'``.
    """
    full_path = f_path + os.sep + filename
    with open(full_path, 'r') as handle:
        # An empty stripped line is falsy, so ``or`` substitutes the placeholder.
        return [raw.strip() or 'None' for raw in handle]
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the encoding that ``chardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``chardet.detect(s)``.
    """
    detection = chardet.detect(s)
    return detection['encoding']
#import chardet.constants
#chardet.constants._debug = 1
def chardet_dammit(s):
    """Return the encoding that ``cchardet`` detects for ``s``.

    The value is the ``'encoding'`` entry of the result returned by
    ``cchardet.detect(s)``.
    """
    detection = cchardet.detect(s)
    return detection['encoding']