def XsSeg2Xae(Xs, Xs_mask, segs, maxUtt, maxLen, nResample=None, check_output=False):
    """Cut each utterance into words and pack them into a fixed-size array.

    For every utterance ``Xs[i]`` the frames whose ``Xs_mask[i]`` entry is
    truthy are dropped, the remaining frames are split into words at the
    indices where ``segs[i]`` is non-zero, and each word is written into a
    ``(word_len, frame_size)`` slot. Words are right-aligned within the
    utterance (leading word slots stay zero) and, when not resampling,
    frames are right-aligned within the word slot (leading frames stay zero).

    Args:
        Xs: Array indexed as ``Xs[i][t]`` -> frame vector; the frame size is
            taken from ``Xs.shape[-1]``.
        Xs_mask: Per-utterance mask; truthy entries mark frames to discard.
        segs: Per-utterance segmentation; a non-zero entry at position ``t``
            starts a new word at the ``t``-th *retained* frame.
            NOTE(review): ``segs`` is sliced to the number of retained
            frames, so it is presumably aligned with the unmasked frames
            (e.g. masking is trailing padding) -- confirm against callers.
        maxUtt: Number of word slots per utterance; surplus words are dropped.
        maxLen: Maximum number of frames kept per word; surplus frames are
            dropped from the end of the word.
        nResample: If truthy, every word is converted to exactly this many
            frames: words longer than one frame go through ``resample`` (a
            name expected in module scope, presumably
            ``scipy.signal.resample`` -- confirm), one-frame words are
            repeated.
        check_output: If true, verify that concatenating the non-zero frames
            of the output reproduces the retained source frames.

    Returns:
        Tuple ``(Xae, deletedChars, oneLetter)`` where ``Xae`` has shape
        ``(len(Xs), maxUtt, nResample or maxLen, frame_size)``,
        ``deletedChars[i, k]`` is the number of frames clipped from word slot
        ``k`` plus an equal share of the frames lost to surplus words, and
        ``oneLetter[i, k]`` is 1 where the (clipped) word has one frame.

    Raises:
        AssertionError: If ``check_output`` is true and reconstitution fails.
    """
    n_utts = len(Xs)
    FRAME_SIZE = Xs.shape[-1]
    # Number of frames in each word slot of the output.
    target_len = nResample if nResample else maxLen

    # Preallocating (instead of split + stack) also handles an empty batch.
    Xae = np.zeros((n_utts, maxUtt, target_len, FRAME_SIZE))
    deletedChars = np.zeros((n_utts, maxUtt))
    oneLetter = np.zeros((n_utts, maxUtt))

    for i in range(n_utts):
        frames = Xs[i][np.logical_not(Xs_mask[i])]
        words = np.split(frames, np.where(segs[i, :len(frames)])[0])
        # A boundary at index 0 (or an empty utterance) yields an empty
        # leading chunk, which is not a word.
        if len(words[0]) == 0:
            words.pop(0)

        n_words = min(len(words), maxUtt)
        padwords = maxUtt - n_words
        for j in range(n_words):
            slot = padwords + j
            w_len = min(len(words[j]), maxLen)
            deletedChars[i, slot] += max(0, len(words[j]) - maxLen)
            oneLetter[i, slot] += int(w_len == 1)
            clipped = words[j][:w_len]
            if nResample:
                if w_len > 1:
                    word = resample(clipped, nResample)
                else:
                    word = np.repeat(clipped, nResample, axis=0)
                # The resampled word always fills the whole slot. (Using
                # maxLen here broke the assignment when maxLen < nResample.)
                w_len = nResample
            else:
                word = clipped
            Xae[i, slot, -w_len:] = word

        # Uniformly distribute clipping penalty for excess words
        extraWDel = sum(len(w) for w in words[maxUtt:])
        deletedChars[i, :] += float(extraWDel) / maxUtt

    ## NOTE: Reconstitution will fail if there has been any clipping.
    ## Do not use this feature unless maxutt and maxlen are large enough
    ## to make clipping very unlikely.
    ## Currently only works in acoustic mode.
    if check_output:
        for i in range(n_utts):
            src = Xs[i][np.logical_not(Xs_mask[i])]
            target = Xae[i]
            # Padding is recognised as all-zero frames, so a genuinely
            # all-zero source frame would also be dropped here.
            pieces = [np.zeros((0, FRAME_SIZE))]
            for wi in range(maxUtt):
                pieces.append(target[wi][np.where(target[wi].any(-1))])
            reconstituted = np.concatenate(pieces)
            assert len(reconstituted) >= len(src), \
                'Reconstitution of utterance %d produced %d frames but the source has %d.' \
                % (i, len(reconstituted), len(src))
            for j in range(len(src)):
                # Clamp so the context window is not empty at j == 0.
                lo = max(j - 1, 0)
                assert np.allclose(src[j], reconstituted[j]), \
                    '''Reconstitution of MFCC frames failed at timestep %d.
                    Source region: %s\n Reconstituted region: %s''' \
                    % (j, src[lo:j+2], reconstituted[lo:j+2])

    return Xae, deletedChars, oneLetter
评论列表
文章目录