# LawGPT / tools / clear_law.py
# Uploaded by Jun1113 using huggingface_hub
# Commit 484b605 (verified)
import re
import json
class read_lawfile:
    """Split a law text file into chapters and, within each chapter, articles.

    Headings are located with two regular expressions: one for chapter
    headings and one for article (entry) headings. The parsed result is a
    nested mapping ``{chapter heading: {article heading: article text}}``
    stored on ``self.law``.
    """

    def __init__(
        self,
        chapter_moder=r"第[零一二三四五六七八九十百千万]+章 .+\b",
        entry_mode=r"第[零一二三四五六七八九十百千万]+条\b",
    ):
        """Store the heading patterns.

        Args:
            chapter_moder: Regex matching a chapter heading.
            entry_mode: Regex matching an article (entry) heading.
        """
        # Patterns that identify chapter headings and article headings.
        self.chapter_mode = chapter_moder
        self.entry_mode = entry_mode
        # Start with an empty result so show() works before read_file().
        self.law = {}

    @staticmethod
    def _split_by_heading(pattern, text, skip=0):
        """Return ``(heading, body)`` pairs for each ``pattern`` match in ``text``.

        After each heading, ``skip`` extra characters are dropped before the
        body starts. The body runs up to the next heading, or to the end of
        the text for the last heading. Text before the first heading is
        discarded.
        """
        sections = []
        match = re.search(pattern, text)
        while match is not None:
            heading = match.group()
            # Re-search on the remaining text (rather than using a start
            # offset) so anchors in custom patterns keep their meaning.
            text = text[match.end() + skip:]
            match = re.search(pattern, text)
            body = text if match is None else text[:match.start()]
            sections.append((heading, body))
        return sections

    def read_file(self, file_path):
        """Read a UTF-8 law file and parse it into ``self.law``.

        Blank lines (``"\\n\\n"``) are collapsed to single newlines and
        ``"##"`` markers are removed before parsing.

        Args:
            file_path: Path of the file to read.

        Returns:
            The ``{chapter: {article: text}}`` mapping, also kept on
            ``self.law``. It is empty when no chapter heading is found.
        """
        self.law = {}
        # The context manager closes the file even if reading fails.
        with open(file_path, encoding='utf-8') as f:
            content = f.read()
        content = content.replace("\n\n", "\n")
        content = content.replace("##", "")
        for chapter, body in self._split_by_heading(self.chapter_mode, content):
            self.law[chapter] = self.read_entrys(body)
        return self.law

    def read_entrys(self, content):
        """Split one chapter's text into ``{article heading: article text}``.

        The single character that follows each article heading is skipped,
        so it is not part of the article text. If the same heading occurs
        more than once, the last occurrence wins.
        """
        return dict(self._split_by_heading(self.entry_mode, content, skip=1))

    def show(self):
        """Print every chapter heading followed by its articles."""
        for key in self.law:
            print(key, '\n')
            for item in self.law[key]:
                print(item, ' ', self.law[key][item])
if __name__ == '__main__':
    # Script entry point: parse one law file, print it, and dump it as JSON.
    import sys

    # An optional first command-line argument overrides the sample path.
    default_path = "D:/11496/Documents/project/Laws-master/经济法/价格法(1997-12-29).md"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    r = read_lawfile()
    # Named parsed_law rather than dict to avoid shadowing the builtin.
    parsed_law = r.read_file(file_path)
    r.show()
    print(parsed_law)
    # ensure_ascii=False writes raw non-ASCII characters, so the encoding is
    # pinned to UTF-8 instead of relying on the platform default.
    with open('./a.json', 'w', encoding='utf-8') as f:
        json.dump(parsed_law, f, ensure_ascii=False)