token_characters_indexer.py 文件源码-python代码片段

def pad_token_sequence(self,
                           tokens: List[List[int]],
                           desired_num_tokens: int,
                           padding_lengths: Dict[str, int]) -> List[List[int]]:
        """Pad a list of character-id tokens into a rectangular list of lists.

        Padding happens along two axes:

        1. The number of tokens is adjusted to ``desired_num_tokens`` by
           ``pad_sequence_to_length``, with ``self.get_padding_token`` passed
           as the factory for filler tokens.
        2. Every resulting token is then truncated or right-padded with ``0``
           so that it holds exactly
           ``padding_lengths['num_token_characters']`` entries.

        Parameters
        ----------
        tokens : ``List[List[int]]``
            One inner list of character ids per token.
        desired_num_tokens : ``int``
            Target number of tokens, forwarded to ``pad_sequence_to_length``.
        padding_lengths : ``Dict[str, int]``
            Must contain the key ``'num_token_characters'``, the target length
            of each inner list.

        Returns
        -------
        ``List[List[int]]``
            A new list with one new inner list per token, each of length
            ``padding_lengths['num_token_characters']``.

        Raises
        ------
        KeyError
            If ``padding_lengths`` has no ``'num_token_characters'`` entry.
        """
        # Pad (or truncate) the number of tokens.
        padded_tokens = pad_sequence_to_length(tokens, desired_num_tokens, default_value=self.get_padding_token)

        # Pad (or truncate) the characters within each token.
        desired_token_length = padding_lengths['num_token_characters']
        padding_value = 0

        # Each row is sized on its own rather than via a zip_longest transpose
        # keyed on the longest token of the *unpadded* input.  That approach
        # under-padded whenever the longest token had been truncated away in
        # the step above, and dropped every row when all tokens were empty and
        # the desired length was 0.
        result: List[List[int]] = []
        for token in padded_tokens:
            # list(...) copies the row, so the caller's data is never mutated,
            # and accepts any iterable the padding-token factory may return.
            characters = list(token)[:desired_token_length]
            # A non-positive multiplier yields [], so rows that are already
            # long enough are left as truncated above.
            characters.extend([padding_value] * (desired_token_length - len(characters)))
            result.append(characters)
        return result