def pad_token_sequence(self,
                       tokens: List[List[int]],
                       desired_num_tokens: int,
                       padding_lengths: Dict[str, int]) -> List[List[int]]:
    """Pad a sequence of character-id lists along both dimensions.

    The number of tokens is adjusted first by ``pad_sequence_to_length``
    (called with ``self.get_padding_token`` as the default value), then every
    resulting token is right-padded with ``0`` or truncated so that it holds
    exactly ``padding_lengths['num_token_characters']`` entries.

    Parameters
    ----------
    tokens:
        One list of integer character ids per token.
    desired_num_tokens:
        Target number of tokens, forwarded to ``pad_sequence_to_length``.
    padding_lengths:
        Must contain the key ``'num_token_characters'``: the target number of
        characters per token.

    Returns
    -------
    A list with one list of ints per token produced by the token-count step.

    Raises
    ------
    KeyError
        If ``'num_token_characters'`` is missing from ``padding_lengths``.
    """
    # Pad the tokens.
    padded_tokens = pad_sequence_to_length(tokens, desired_num_tokens, default_value=self.get_padding_token)

    # Pad the characters within the tokens.
    desired_token_length = padding_lengths['num_token_characters']
    padding_value = 0

    # Measure the tokens that will actually be emitted. The raw input may
    # hold a longer token that the step above did not keep; measuring the
    # raw input would then skip the dummy token and leave the kept tokens
    # shorter than requested.
    longest_token_length = max(map(len, padded_tokens), default=0)
    needs_dummy_token = desired_token_length > longest_token_length

    if not needs_dummy_token and longest_token_length == 0:
        # Every token is empty and no padding is required. Transposing zero
        # columns below would collapse the result to [] and lose the rows.
        return [[] for _ in padded_tokens]

    if needs_dummy_token:
        # Since we want to pad to greater than the longest token, we add a
        # "dummy token" so we can take advantage of the fast implementation
        # of itertools.zip_longest. Concatenation builds a new list, so the
        # list returned by the helper is never mutated.
        padded_tokens = padded_tokens + [[padding_value] * desired_token_length]

    # Transpose twice: zip_longest fills short tokens with the padding value
    # column by column, and zip turns the columns back into rows.
    padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))

    if needs_dummy_token:
        # Removes the "dummy token".
        padded_tokens.pop()

    # Truncates all the tokens to the desired length, and return the result.
    return [list(token[:desired_token_length]) for token in padded_tokens]
# Comment list
# Table of contents