Compare commits

...

2 Commits

| SHA1 | Message | Date |
| --- | --- | --- |
| dfdc370b8e | docs: README | 2023-03-29 02:35:57 +08:00 |
| 5302fdcc7c | fix: add eng to tokens function | 2023-03-29 02:35:44 +08:00 |
3 changed files with 24 additions and 0 deletions

README.md (Normal file, +5)

@@ -0,0 +1,5 @@
# Transformer-based Translator
Simple Transformer
![](image/Screenshot_20230329_023305.png)

Binary file not shown (new image, 67 KiB).


@@ -22,6 +22,25 @@ SHOW_NUM = 5
NUM_HEADS = 8
DROPOUT_RATE = 0.5
def en2tokens(en_sentence, en_vocab, for_model=False, en_seq=50):
    '''
    Convert an English sentence to a tensor of token indices.
    Args:
        en_sentence (str)
        en_vocab (torchtext.vocab.Vocab)
        for_model (bool, default=False): if `True`, wrap with <SOS>/<END> and pad with <PAD>
        en_seq (int): target length when padding with <PAD>
    Outputs:
        tokens (LongTensor): shape (en_seq,) when `for_model=True`, otherwise the raw token count
    '''
    tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
    tokens = en_vocab(tokenizer(en_sentence.lower()))
    if for_model:
        tokens = [en_vocab['<SOS>']] + tokens + [en_vocab['<END>']]
        tokens = tokens + [en_vocab['<PAD>'] for _ in range(en_seq - len(tokens))]
    return torch.LongTensor(tokens)
def predict(en_str, model, en_vocab, ch_vocab):
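
For context, here is a minimal, hypothetical usage sketch of `en2tokens` (not part of the diff). It assumes the function above is importable, that the vocabulary is built with `torchtext.vocab.build_vocab_from_iterator` over a placeholder corpus, and that the special tokens match the names the function expects (`<SOS>`, `<END>`, `<PAD>`, plus a `<UNK>` fallback).

```python
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Placeholder corpus, only used here to build a tiny demo vocabulary.
corpus = ["a simple transformer based translator", "tokens for the model"]
tokenizer = get_tokenizer("basic_english")

en_vocab = build_vocab_from_iterator(
    (tokenizer(sentence.lower()) for sentence in corpus),
    specials=['<UNK>', '<PAD>', '<SOS>', '<END>'],
)
en_vocab.set_default_index(en_vocab['<UNK>'])  # map out-of-vocabulary words to <UNK>

# With for_model=True the output is wrapped in <SOS>/<END> and padded to en_seq.
tokens = en2tokens("A simple translator", en_vocab, for_model=True, en_seq=50)
print(tokens.shape)  # torch.Size([50])
```

With `for_model=True` the result always has `en_seq` entries, presumably so that batched inputs to the Transformer share a fixed length.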