def identify_and_tag_DOI(line):
    """Replace every DOI found in a citation line with a placeholder tag.

    What counts as a DOI is decided entirely by the module-level ``re_doi``
    pattern; the DOI text itself is read from that pattern's named group
    ``doi``. A DOI containing a percent-encoded slash (``%2f``, in any
    case) is URL-decoded before being returned.

    @param line: (string) the reference line in which to search for DOIs.
    @return: (tuple) the line with each match replaced by ``<cds.DOI />``,
        and the list of DOI strings in their order of appearance (empty
        if nothing matched).
    """
    dois = []
    segments = []
    # Index in `line` just past the previously replaced match.
    cursor = 0

    # Matches are yielded left to right and never overlap, so the tagged
    # line can be assembled from the untouched text between matches.
    for found in re_doi.finditer(line):
        doi = found.group('doi')
        # Only decode when an encoded slash is present.
        if '%2f' in doi.lower():
            doi = unquote(doi)
        dois.append(doi)

        segments.append(line[cursor:found.start()])
        segments.append("<cds.DOI />")
        cursor = found.end()

    # Whatever follows the last match (or the whole line if none matched).
    segments.append(line[cursor:])
    return "".join(segments), dois
评论列表
文章目录