def downloadAllJournalArticles(skip=False, dontskip=""):
    """Download articles for every journal in a list of journal names."""
    if dontskip != "":
        skip = True
    # all Elsevier CS journal names, one per line
    with io.open("../compsci_journals.txt") as f:
        for l in f:
            if len(l) > 2:
                j = l.strip("\n")
                # Skip journals that already have a folder; for those we assume
                # downloading has finished (unless the journal is named in dontskip).
                if skip and j.lower().replace(" ", "_") in os.listdir("../elsevier_papers_xml"):
                    if j != dontskip:
                        print("Skipping journal:", j)
                        continue
                print("Downloading articles for journal:", j)
                jurl = getJournalURL(j)
                downloadArticles("../elsevier_papers_xml/" + j.lower().replace(" ", "_") + "/", jurl)
Python open() usage examples (source code)
Source file: sciencedirect_collect.py (project: scientific-paper-summarisation, author: EdCo95)
def run_script(self, script_name, namespace):
    script = 'scripts/' + script_name
    if not self.has_metadata(script):
        raise ResolutionError("No script named %r" % script_name)
    script_text = self.get_metadata(script).replace('\r\n', '\n')
    script_text = script_text.replace('\r', '\n')
    script_filename = self._fn(self.egg_info, script)
    namespace['__file__'] = script_filename
    if os.path.exists(script_filename):
        # the script exists on disk: read, compile and run it from there
        source = open(script_filename).read()
        code = compile(source, script_filename, 'exec')
        exec(code, namespace, namespace)
    else:
        # the script only exists inside the egg: seed linecache so tracebacks
        # can still show its source, then compile from the in-memory text
        from linecache import cache
        cache[script_filename] = (
            len(script_text), 0, script_text.split('\n'), script_filename
        )
        script_code = compile(script_text, script_filename, 'exec')
        exec(script_code, namespace, namespace)
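The linecache trick above is what lets tracebacks show source for a script that never touches disk. A self-contained sketch (the filename is invented; a cache entry is (size, mtime, lines, fullname), and an mtime of None marks the entry as never stale):

from linecache import cache, getline

script_text = "def greet():\n    return 'hello'\n"
script_filename = "<embedded:demo_script>"

# seed linecache so the in-memory source is visible to tracebacks/inspect
cache[script_filename] = (
    len(script_text), None, script_text.split('\n'), script_filename
)

namespace = {}
exec(compile(script_text, script_filename, 'exec'), namespace, namespace)
print(namespace['greet']())                  # hello
print(getline(script_filename, 1).rstrip())  # def greet():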
def whitelist():
    # get the contents of whitelist.txt so they can be displayed
    with io.open('static/whitelist.txt', 'r', encoding="utf-8") as f:
        data = f.read().replace('@', ' [at] ').replace('.', ' [dot] ')
    return render_template('whitelist.html', data=data)
def tail():
    if request.method == 'POST':
        fi = request.form['file']
        if os.path.isfile(fi):
            n = int(request.form['n'])
            le = io.open(fi, 'r', encoding='utf-8')
            taildata = le.read()[-n:]
            le.close()
        else:
            taildata = "No such file."
        return render_template('tail.html', taildata=taildata)
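Note that read()[-n:] above returns the last n characters, not the last n lines. If last n lines were intended, collections.deque with maxlen gives that in one pass without slicing the whole file (a sketch, not the project's code):

import io
from collections import deque

def tail_lines(path, n):
    with io.open(path, 'r', encoding='utf-8') as f:
        return ''.join(deque(f, maxlen=n))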
def wladd():
    if request.method == 'POST':
        addr = request.form['addr'].strip()
        # request.form values are already unicode, so no .decode() is needed
        with io.open('static/whitelist.txt', 'a', encoding="utf-8") as f:
            f.write(addr + u'\r\n')
    return render_template('wladd.html')
def unsub():
    if request.method == 'POST':
        addr = request.form['addr'].strip()
        with io.open('unsubscribers.txt', 'a', encoding="utf-8") as f:
            f.write(addr + u'\r\n')
        # rewrite the whitelist without any line containing the address;
        # lines read via io.open are already unicode, so no .decode() is needed
        with io.open('static/whitelist.txt', 'r', encoding="utf-8") as f:
            lines = f.readlines()
        with io.open('static/whitelist.txt', 'w', encoding="utf-8") as f:
            for line in lines:
                if addr not in line:
                    f.write(line)
        return render_template('unsubscribed.html', addr=addr)
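The read-all/rewrite-in-place pattern above loses the whitelist if the process dies after the 'w' open truncates the file. A common safer variant (a sketch under assumed names, not this project's code) writes to a temporary file and renames it over the original:

import io
import os
import tempfile

def remove_lines_containing(path, needle):
    with io.open(path, 'r', encoding='utf-8') as f:
        kept = [line for line in f if needle not in line]
    fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
    with io.open(fd, 'w', encoding='utf-8') as f:
        f.writelines(kept)
    os.replace(tmp, path)  # atomic replacement (Python 3.3+)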
def save(self, filename):
    info_dict = {
        "tokens": self.tokens,
        "strings": self.strings,
        "s2t": dict(self.s2t),
        "i2t": dict(self.i2t),
        "unk": self.unk,
        "START_TOK": self.START_TOK,
        "END_TOK": self.END_TOK
    }
    # pickle needs a binary-mode handle on Python 3
    with open(filename, "wb") as f:
        pickle.dump(info_dict, f)
def load(cls, filename):
    # binary mode to match the pickle written by save()
    with open(filename, "rb") as f:
        info_dict = pickle.load(f)
    v = Vocab()
    v.tokens = info_dict["tokens"]
    v.strings = info_dict["strings"]
    v.unk = info_dict["unk"]
    v.START_TOK = info_dict["START_TOK"]
    v.END_TOK = info_dict["END_TOK"]
    defaultf = (lambda: v.unk) if (v.unk is not None) else Token.not_found
    v.s2t = defaultdict(defaultf, info_dict["s2t"])
    v.i2t = defaultdict(defaultf, info_dict["i2t"])
    return v
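A minimal round-trip showing why the binary modes matter: pickle writes bytes, so text-mode handles fail on Python 3 (the filename is hypothetical):

import pickle

info = {"tokens": 3, "unk": "<unk>"}
with open("vocab.pkl", "wb") as f:
    pickle.dump(info, f)
with open("vocab.pkl", "rb") as f:
    assert pickle.load(f) == info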
def parse_init():
    with open(os.path.join(HERE, PKG_NAME, '__init__.py')) as f:
        file_data = f.read()
    return [regex.search(file_data).group(2) for regex in
            (AUTHOR, DOCSTRING, VERSION)]
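parse_init() relies on AUTHOR, DOCSTRING and VERSION regexes defined elsewhere in the setup script. A self-contained sketch with plausible (assumed) patterns; note that group(2) matches the original's call:

import re

VERSION = re.compile(r"^(__version__)\s*=\s*['\"]([^'\"]*)['\"]", re.M)
AUTHOR = re.compile(r"^(__author__)\s*=\s*['\"]([^'\"]*)['\"]", re.M)

init_text = '__version__ = "1.2.3"\n__author__ = "Jane Doe"\n'
print(VERSION.search(init_text).group(2))  # 1.2.3
print(AUTHOR.search(init_text).group(2))   # Jane Doe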
def read(*filenames, **kwargs):
    encoding = kwargs.get('encoding', 'utf-8')
    sep = kwargs.get('sep', '\n')
    buf = []
    for filename in filenames:
        with io.open(filename, encoding=encoding) as f:
            buf.append(f.read())
    return sep.join(buf)
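Typical use of read() above is building one long_description string for a setup.py. A small check assuming the helper is in scope (the temp files stand in for README-style inputs):

import io
import os
import tempfile

tmp = tempfile.mkdtemp()
for name, text in [('a.rst', u'alpha'), ('b.rst', u'beta')]:
    with io.open(os.path.join(tmp, name), 'w', encoding='utf-8') as f:
        f.write(text)

print(read(os.path.join(tmp, 'a.rst'), os.path.join(tmp, 'b.rst'), sep='\n\n'))
# alpha
#
# beta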
Source file: postprocess_toc_yml.py (project: azure-docs-sdk-python, author: MicrosoftDocs)
def rewrite_yml(data):
    with io.open('toc.yml', 'w', encoding='utf8') as outfile:
        yaml.dump(data, outfile, default_flow_style=False, allow_unicode=True)
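A round-trip sketch of the dump call above (requires PyYAML; the toc.yml name is the project's, the sample data is invented):

import io
import yaml

data = [{'name': 'azure.storage', 'items': [{'name': 'BlobService'}]}]
with io.open('toc.yml', 'w', encoding='utf8') as outfile:
    yaml.dump(data, outfile, default_flow_style=False, allow_unicode=True)
with io.open('toc.yml', 'r', encoding='utf8') as infile:
    assert yaml.safe_load(infile) == data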
def embeddings_to_dict(filename):
    '''
    :param filename: the file name of the word embeddings | file is assumed
        to follow this format: "word[tab]dimension 1[space]dimension 2[space]...[space]dimension 50"
    :return: a dictionary with keys that are words and values that are the embeddings of those words
    '''
    with io.open(filename, 'r', encoding='utf-8') as f:
        word_vecs = {}
        for line in f:
            line = line.strip('\n').split()
            word_vecs[line[0]] = np.array([float(s) for s in line[1:]])
    return word_vecs
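A tiny end-to-end check of the format the docstring describes, using a 3-dimensional toy file (assumes embeddings_to_dict above and numpy as np are in scope; the filename is invented):

import io

with io.open('toy_vecs.txt', 'w', encoding='utf-8') as f:
    f.write(u'the\t0.1 0.2 0.3\ncat\t0.4 0.5 0.6\n')

vecs = embeddings_to_dict('toy_vecs.txt')
assert sorted(vecs) == ['cat', 'the']
assert vecs['the'].tolist() == [0.1, 0.2, 0.3]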
def load_test_fixture(fixture_path):
    path = os.path.dirname(os.path.abspath(__file__))
    with open(path + '/' + fixture_path) as fixture_file:
        fixture_input = fixture_file.read()
    sys.stdin = StringIO(fixture_input)
    sys.stdout = StringIO()
def load_data(file: str):
    with io.open(os.path.join(__abspath__, 'test_data', file)) as afile:
        input_str = afile.read().replace('PATH', os.path.join(__abspath__, 'test_data'))
    sys.stdin = io.StringIO(input_str)
    sys.stdout = io.StringIO()
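The same redirection shown end-to-end: feed a script that reads input() and capture what it prints (a self-contained sketch that restores the real streams afterwards):

import io
import sys

sys.stdin = io.StringIO(u'world\n')
sys.stdout = io.StringIO()
print('hello ' + input())
captured = sys.stdout.getvalue()
sys.stdin, sys.stdout = sys.__stdin__, sys.__stdout__
assert captured == 'hello world\n'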
def run(path, quiet=False):
    """
    Downloads all available hash files to a given path.

    :param path: Path to download directory
    :param quiet: If set to True, no progressbar is displayed
    """
    if not os.path.isdir(path):
        print('Given path is not a directory.')
        sys.exit(1)
    session = requests.Session()
    session.headers = {'User-agent': 'Mozilla/5.0 Chrome/57.0.2987.110'}
    # scrape the highest three-digit file index from the hash listing page
    listing = session.get('https://virusshare.com/hashes.4n6').text
    max_num = max(int(re.sub(r'[<>]', '', m))
                  for m in re.findall(r'>[1-9][0-9]{2}<', listing))
    if not quiet:
        p = progressbar.ProgressBar(max_value=max_num)
    for i in range(max_num):
        filename = str(i).zfill(3) + '.md5'
        if os.path.exists(os.path.join(path, filename)):
            continue
        if not quiet:
            p.update(i)
        url = URL + filename
        head = session.head(url)
        if head.status_code == 200:
            body = session.get(url, stream=True)
            with io.open(os.path.join(path, filename), mode='wb') as afile:
                for chunk in body.iter_content(chunk_size=1024):
                    afile.write(chunk)
            body.close()
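The head-then-stream-download pattern above, reduced to a self-contained helper (requires requests; the URL is whatever endpoint you point it at):

import io
import requests

def fetch(url, dest, chunk_size=1024):
    # confirm the file exists before downloading it
    if requests.head(url).status_code != 200:
        return False
    body = requests.get(url, stream=True)
    with io.open(dest, mode='wb') as afile:
        for chunk in body.iter_content(chunk_size=chunk_size):
            afile.write(chunk)
    body.close()
    return True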
def run(self):
    searchhash = ''
    if self.data_type == 'hash':
        searchhash = self.getData()
        if len(searchhash) != 32:
            self.report({'isonvs': 'unknown',
                         'hash': searchhash})
    elif self.data_type == 'file':
        filepath = self.getParam('file')
        hasher = hashlib.md5()
        with io.open(filepath, mode='rb') as afile:
            for chunk in iter(lambda: afile.read(65536), b''):
                hasher.update(chunk)
        searchhash = hasher.hexdigest()
    else:
        self.error('Unsupported data type.')

    # Read files
    for file in self.filelist:
        filepath = os.path.join(self.path, file)
        if not os.path.isfile(filepath):
            continue
        with io.open(filepath, 'r') as afile:
            for line in afile:
                # Skipping comments
                if line[0] == '#':
                    continue
                if searchhash.lower() in line:
                    self.report({'isonvs': True,
                                 'md5': searchhash})
    self.report({'isonvs': False,
                 'md5': searchhash})
def readme():
    import io
    with io.open('README.rst', "r", encoding="utf-8") as f:
        long_description = f.read()
    return long_description