Noonisy
几种tokenize的方法
2022-12-21
阅读:487

几种tokenize的方法


1.nltk

import nltk

sentence = "No left turn sign on a street sign at 3rd Ave. A sky view of London, England- the palace and parliament. a retro kitchen with brown wood-paneled designs and other 1960's-era objects."

# NOTE: word_tokenize needs the Punkt tokenizer data to be installed first,
# e.g. via nltk.download('punkt') (the resource name may vary by NLTK version).
tokens = nltk.word_tokenize(sentence)

# word_tokenize already returns a list of strings, so it can be joined directly.
' | '.join(tokens)

"No | left | turn | sign | on | a | street | sign | at | 3rd | Ave. | A | sky | view | of | London | , | England- | the | palace | and | parliament | . | a | retro | kitchen | with | brown | wood-paneled | designs | and | other | 1960's-era | objects | ."

2.spacy

from spacy.lang.en import English

# Language-specific pipeline object created without loading a trained model;
# calling it on text applies the English tokenization rules.
nlp = English()

sentence = "No left turn sign on a street sign at 3rd Ave. A sky view of London, England- the palace and parliament. a retro kitchen with brown wood-paneled designs and other 1960's-era objects."

# Iterating over the processed document yields tokens; keep only their text.
tokens = nlp(sentence)
token_list = [token.text for token in tokens]

' | '.join(token_list)

"No | left | turn | sign | on | a | street | sign | at | 3rd | Ave | . | A | sky | view | of | London | , | England- | the | palace | and | parliament | . | a | retro | kitchen | with | brown | wood | - | paneled | designs | and | other | 1960's | - | era | objects | ."

3.stanfordnlp

import stanfordnlp

# Build a pipeline that runs only the tokenize processor for English.
nlp = stanfordnlp.Pipeline(processors='tokenize', lang='en')
sentence = "No left turn sign on a street sign at 3rd Ave. A sky view of London, England- the palace and parliament. a retro kitchen with brown wood-paneled designs and other 1960's-era objects."

doc = nlp(sentence)

# One joined string per detected sentence. The loop variable is named `sent`
# so that it does not overwrite the input string `sentence`.
token_list = [
    ' | '.join(token.text for token in sent.tokens)
    for sent in doc.sentences
]

token_list

['No | left | turn | sign | on | a | street | sign | at | 3rd',
 'Ave | .',
 'A | sky | view | of | London | , | England | - | the | palace | and | parliament | .',
 "a | retro | kitchen | with | brown | wood | - | paneled | designs | and | other | 1960 | 's | - | era | objects | ."]
最后编辑于:2022 年 12 月 26 日 18:13
邮箱格式错误
网址请用http://或https://开头