tag.py 文件源码-python代码片段

def identify_and_tag_DOI(line):
    """Replace every DOI reference in a citation line with a DOI tag.

       The module-level `re_doi` pattern decides what counts as a DOI
       reference; the text captured by its named group 'doi' is what gets
       reported back to the caller.
       @param line: (string) the reference line to scan for DOIs.
       @return: (tuple) the line with each match replaced by "<cds.DOI />",
        and the list of extracted DOI strings in order of appearance
        (empty when nothing matched).
    """
    # Materialise the matches once: they are walked twice below
    found = list(re_doi.finditer(line))

    # First pass (left to right): extract the DOI strings so that the
    # resulting list follows the order in which they occur in the line
    doi_strings = []
    for hit in found:
        doi_text = hit.group('doi')
        # A percent-encoded slash means the DOI was URL-quoted; decode it
        if '%2f' in doi_text.lower():
            doi_text = unquote(doi_text)
        doi_strings.append(doi_text)

    # Second pass (right to left): substitute the tag for each match.
    # Working backwards keeps the offsets of earlier matches valid even
    # though the replacement changes the length of the line
    for hit in reversed(found):
        head = line[0:hit.start()]
        tail = line[hit.end():]
        line = head + "<cds.DOI />" + tail

    return line, doi_strings