Miscellaneous scripts
This repository contains miscellaneous scripts that do not fit in any single repository, yet I still use them from time to time for my personal use. Note that some of the scripts might contain hardcoded paths and opinionated presets, and you are advised to inspect them before actually using them.
Loading...
Searching...
No Matches
dirty_tokens.py
Go to the documentation of this file.
1import re
2import tiktoken
3from typing import Dict
4
# Module-level tokenizer encoding used by the functions in this file.
# "o200k_base" is the active vocabulary; the "cl100k_base" line below is a
# commented-out alternative that can be swapped in.
enc = tiktoken.get_encoding("o200k_base")
# enc = tiktoken.get_encoding("cl100k_base")
7
def contains_chinese(text: str) -> bool:
    """
    Report whether the given text holds at least one Chinese character.

    A character counts as Chinese when its code point lies in the range
    U+4E00..U+9FFF (inclusive). Returns True as soon as one such character
    exists anywhere in the text, and False otherwise (including for an
    empty string).
    """
    first_hit = re.search(r'[\u4e00-\u9fff]', text)
    if first_hit is None:
        return False
    return True
14
def dump_long_token(n: int = -1) -> None:
    """
    Dump the longest vocabulary tokens that contain Chinese characters.

    Every token id in ``range(enc.n_vocab)`` is decoded on its own; ids whose
    decoded text contains a Chinese character are kept. The kept tokens are
    sorted by the length of their decoded string (longest first) and written
    to ``long_chinese_tokens.txt`` in the current working directory, one
    ``id: "text"`` entry per token.

    Args:
        n: Maximum number of tokens to write. Any negative value (including
            the default ``-1``) means "no limit" and writes every match.
    """
    token_dict: Dict[int, str] = {}
    for token_id in range(enc.n_vocab):
        try:
            decoded = enc.decode([token_id])  # Decode the single token id to text
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or an interpreter shutdown.
            raise
        except BaseException:
            # Best-effort scan: ids that cannot be decoded are skipped.
            # NOTE(review): kept broad on purpose - some tiktoken builds may
            # report undecodable ids with an error outside the Exception
            # hierarchy; confirm before narrowing this clause.
            continue
        if contains_chinese(decoded):
            token_dict[token_id] = decoded

    # Longest decoded strings first; sorted() is stable, so ties keep id order.
    ranked = sorted(token_dict.items(), key=lambda item: len(item[1]), reverse=True)
    if n >= 0:
        # Only truncate for a real limit: slicing with a negative n would
        # silently drop entries from the END of the list instead.
        ranked = ranked[:n]

    with open("long_chinese_tokens.txt", "w", encoding="utf-8") as out:
        for token_id, decoded in ranked:
            out.write(f'{token_id}: "{decoded}"\n')
    print('Top tokens written to long_chinese_tokens.txt')
38
39if __name__ == '__main__':
bool contains_chinese(str text)
None dump_long_token(int n=-1)