def transform(self, docs, buffer_size=100):
    """Run the external tagger over *docs* and lazily yield its output.

    The command in ``self.RUN_TAGGER_CMD`` is started with
    ``--output-format conll`` appended.  ``self._write_input`` is run in a
    background thread with ``(docs, proc, buffer_sema)``; it is presumably
    responsible for feeding the documents to the tagger's stdin and for
    acquiring ``buffer_sema`` as it goes (verify against its definition).
    This generator reads the tagger's stdout, groups the lines up to each
    blank line into one result, and releases ``buffer_sema`` once per
    result after the consumer has resumed the generator.

    Args:
        docs: Documents to tag; passed through unchanged to
            ``self._write_input``.
        buffer_size: Initial value of the bounded semaphore shared with
            the writer thread.

    Yields:
        A list of ``(word, tag, confidence)`` tuples for every
        blank-line-terminated group of output lines, with ``confidence``
        converted to ``float``.

    Raises:
        ValueError: If an output line does not consist of exactly three
            tab-separated fields, or its third field is not a float.

    The tagger process is always reaped: after a normal run the writer
    thread is joined and the pipes are closed; if the generator is closed
    early or an error occurs, the process is killed instead (the writer
    thread is not joined in that case, since it may be blocked).
    """
    args = shlex.split(self.RUN_TAGGER_CMD) + ['--output-format', 'conll']
    proc = subprocess.Popen(
        args,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    def _drain_stderr():
        # Discard the tagger's stderr so a full pipe can never block it.
        for _ in proc.stderr:
            pass

    stderr_drain = threading.Thread(target=_drain_stderr, daemon=True)
    stderr_drain.start()

    buffer_sema = threading.BoundedSemaphore(buffer_size)
    t = threading.Thread(target=self._write_input, args=(docs, proc, buffer_sema))
    t.start()

    finished = False
    try:
        while True:
            # reading can only follow writing unless EOF is reached so buffer_sema >= 0
            res = []
            while True:
                line = proc.stdout.readline().decode('utf-8').rstrip()
                if line == '':
                    # Blank separator line or EOF: the current group is complete.
                    break
                word, tag, confidence = line.split('\t')
                res.append((word, tag, float(confidence)))
            if not res:
                # No tokens before the blank line/EOF: treat as end of output.
                break
            yield res
            buffer_sema.release()
        finished = True
    finally:
        if finished:
            t.join()
        else:
            # Abandoned generator or parse error: do not leave the child running.
            proc.kill()
        proc.stdout.close()
        proc.wait()
        if finished:
            # Only wait for stderr EOF after a clean run; after a kill a
            # surviving grandchild could keep the pipe open indefinitely.
            stderr_drain.join()
            proc.stderr.close()
# Page-scrape residue (site navigation labels "Comment list" / "Table of contents"); not part of the code.