-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest.py
More file actions
61 lines (37 loc) · 1.28 KB
/
test.py
File metadata and controls
61 lines (37 loc) · 1.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#encoding=utf-8
from __future__ import unicode_literals
import sys
import os
import tkitJson
import tqdm
# Add the parent directory to the module search path (this does not change the working directory)
sys.path.append("../")
# Import the local library
from PreTrans import PreTrans
from transformers import BertTokenizer
# Local path handed to BertTokenizer.from_pretrained.
# Alternative path noted by the author (presumably for Kaggle runs):
#   /kaggle/input/mcbert/mc_bert_base
pretrained_dir = '/home/terry/dev/model/base'
tokenizer = BertTokenizer.from_pretrained(pretrained_dir)
# A teacher model was previously loaded from the same path (currently disabled):
# teacher_model = AutoModel.from_pretrained(pretrained_dir)
# Root directory that is scanned recursively for input files.
file = "../data"

# Collect the full path of every file found anywhere below the root,
# in the order os.walk reports them.
fileList = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(file)
    for name in names
]

# Only the first five collected paths are processed.
fileList = fileList[:5]
# Pre-processing helper built around the tokenizer, created with max_length=512.
P = PreTrans(tokenizer, max_length=512)

# For a quick manual check, P.autoCut(<sample text>) can be called directly
# on a single string and the first items of its result inspected.

for json_path in tqdm.tqdm(fileList):
    reader = tkitJson.Json(json_path)
    # Feed the 'text' field of every record in this file to the helper.
    for entry in reader.auto_load():
        P.autoCut(entry['text'])
    # Run getTok() and save() once per file so results are available
    # as soon as each file has been handled.
    P.getTok()
    P.save()
# Read back what the helper yields from load() and print the result of
# .size() on each record's input_ids (presumably a tensor; confirm in PreTrans).
for record in P.load():
    input_ids = record["data"]["input_ids"]
    print(input_ids.size())