Java与Python实现完全对齐的字级别tokenizer,有何最佳方案?
- 内容介绍
- 文章标签
- 相关推荐
本文共计184个文字,预计阅读时间需要1分钟。
# python
def tokenize_to_str_list(textString):
    """Split a string into a list of single-character tokens.

    Args:
        textString: The text to tokenize.

    Returns:
        A list containing each character of ``textString`` as its own
        string, in the original order. An empty string gives ``[]``.
    """
    split_tokens = []
    for ch in textString:
        split_tokens.append(ch)
    return split_tokens
def convert_to_int_list(split_tokens):
    """Convert every token to an integer with ``int()``.

    Args:
        split_tokens: An iterable of tokens, each parseable by ``int()``.

    Returns:
        A list of the converted integers, in input order.

    Raises:
        ValueError: If a token is a string that is not a valid integer literal.
    """
    # The collapsed one-line version built the list but never returned it.
    return [int(token) for token in split_tokens]
Python侧:
def tokenize_to_str_list(textString):
    """Split a string into a list of single-character tokens.

    Args:
        textString: The text to tokenize.

    Returns:
        A list with one one-character string per character of
        ``textString``, in order. An empty string yields an empty list.
    """
    # Iterating a str yields its characters one at a time, so list() is
    # equivalent to the index-based append loop.
    return list(textString)
def convert_to_int_list(split_tokens):
    """Map tokens to their ids using the ``char2id`` lookup table.

    Tokens that are not keys of ``char2id`` are skipped, so the result may
    be shorter than the input.

    Args:
        split_tokens: An iterable of tokens to look up.

    Returns:
        A list of the ``char2id`` values for the tokens that were found,
        in input order.
    """
    # Look up ``token`` (the loop variable) and return the collected ids;
    # the earlier version indexed an undefined name and returned None.
    return [char2id[token] for token in split_tokens if token in char2id]
Java侧:
/**
 * Splits text into single-character tokens, one token per Unicode code point.
 *
 * <p>Characters outside the Basic Multilingual Plane are stored in a Java
 * {@code String} as two UTF-16 units. Walking the text by code point keeps
 * each such character as a single token instead of two surrogate halves.
 *
 * @param textString the text to tokenize
 * @return the tokens in order; an empty array for an empty string
 * @throws NullPointerException if {@code textString} is {@code null}
 */
public String[] tokenize_to_str_list(final String textString) {
    final int tokenCount = textString.codePointCount(0, textString.length());
    final String[] split_tokens = new String[tokenCount];
    int offset = 0;
    for (int i = 0; i < tokenCount; i++) {
        final int codePoint = textString.codePointAt(offset);
        split_tokens[i] = new String(Character.toChars(codePoint));
        // Advance by 1 or 2 UTF-16 units depending on the code point.
        offset += Character.charCount(codePoint);
    }
    return split_tokens;
}
public int[] convert_to_int_list(final String[] split_tokens) {
int seqLen = split_tokens.length;
int[] output = new int[seqLen];
int index = 0
for(int i = 0; i < seqLen; i++){
if(char2id.containsKey(split_tokens[i])){
output[index] = char2id.get(split_tokens[i]);
index = index + 1;
}
}
return output;
}
本文共计184个文字,预计阅读时间需要1分钟。
# python
def tokenize_to_str_list(textString):
    """Split a string into a list of single-character tokens.

    Args:
        textString: The text to tokenize.

    Returns:
        A list containing each character of ``textString`` as its own
        string, in the original order. An empty string gives ``[]``.
    """
    split_tokens = []
    for ch in textString:
        split_tokens.append(ch)
    return split_tokens
def convert_to_int_list(split_tokens):
    """Convert every token to an integer with ``int()``.

    Args:
        split_tokens: An iterable of tokens, each parseable by ``int()``.

    Returns:
        A list of the converted integers, in input order.

    Raises:
        ValueError: If a token is a string that is not a valid integer literal.
    """
    # The collapsed one-line version built the list but never returned it.
    return [int(token) for token in split_tokens]
Python侧:
def tokenize_to_str_list(textString):
    """Split a string into a list of single-character tokens.

    Args:
        textString: The text to tokenize.

    Returns:
        A list with one one-character string per character of
        ``textString``, in order. An empty string yields an empty list.
    """
    # Iterating a str yields its characters one at a time, so list() is
    # equivalent to the index-based append loop.
    return list(textString)
def convert_to_int_list(split_tokens):
    """Map tokens to their ids using the ``char2id`` lookup table.

    Tokens that are not keys of ``char2id`` are skipped, so the result may
    be shorter than the input.

    Args:
        split_tokens: An iterable of tokens to look up.

    Returns:
        A list of the ``char2id`` values for the tokens that were found,
        in input order.
    """
    # Look up ``token`` (the loop variable) and return the collected ids;
    # the earlier version indexed an undefined name and returned None.
    return [char2id[token] for token in split_tokens if token in char2id]
Java侧:
/**
 * Splits text into single-character tokens, one token per Unicode code point.
 *
 * <p>Characters outside the Basic Multilingual Plane are stored in a Java
 * {@code String} as two UTF-16 units. Walking the text by code point keeps
 * each such character as a single token instead of two surrogate halves.
 *
 * @param textString the text to tokenize
 * @return the tokens in order; an empty array for an empty string
 * @throws NullPointerException if {@code textString} is {@code null}
 */
public String[] tokenize_to_str_list(final String textString) {
    final int tokenCount = textString.codePointCount(0, textString.length());
    final String[] split_tokens = new String[tokenCount];
    int offset = 0;
    for (int i = 0; i < tokenCount; i++) {
        final int codePoint = textString.codePointAt(offset);
        split_tokens[i] = new String(Character.toChars(codePoint));
        // Advance by 1 or 2 UTF-16 units depending on the code point.
        offset += Character.charCount(codePoint);
    }
    return split_tokens;
}
public int[] convert_to_int_list(final String[] split_tokens) {
int seqLen = split_tokens.length;
int[] output = new int[seqLen];
int index = 0
for(int i = 0; i < seqLen; i++){
if(char2id.containsKey(split_tokens[i])){
output[index] = char2id.get(split_tokens[i]);
index = index + 1;
}
}
return output;
}

