Spaces:
Running
Running
updated requirements
Browse files- .idea/code-chunker.iml +1 -0
- Chunker.py +15 -15
- app.py +19 -1
- requirements.txt +45 -0
.idea/code-chunker.iml
CHANGED
@@ -10,5 +10,6 @@
|
|
10 |
</component>
|
11 |
<component name="PackageRequirementsSettings">
|
12 |
<option name="removeUnused" value="true" />
|
|
|
13 |
</component>
|
14 |
</module>
|
|
|
10 |
</component>
|
11 |
<component name="PackageRequirementsSettings">
|
12 |
<option name="removeUnused" value="true" />
|
13 |
+
<option name="modifyBaseFiles" value="true" />
|
14 |
</component>
|
15 |
</module>
|
Chunker.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
from CodeParser import CodeParser
|
3 |
-
from
|
4 |
-
|
5 |
|
6 |
|
7 |
class Chunker(ABC):
|
@@ -20,19 +19,20 @@ class Chunker(ABC):
|
|
20 |
def print_chunks(chunks):
|
21 |
for chunk_number, chunk_code in chunks.items():
|
22 |
print(f"Chunk {chunk_number}:")
|
23 |
-
print("="*40)
|
24 |
print(chunk_code)
|
25 |
-
print("="*40)
|
26 |
|
27 |
@staticmethod
|
28 |
def consolidate_chunks_into_file(chunks):
|
29 |
return "\n".join(chunks.values())
|
30 |
-
|
31 |
@staticmethod
|
32 |
def count_lines(consolidated_chunks):
|
33 |
lines = consolidated_chunks.split("\n")
|
34 |
return len(lines)
|
35 |
|
|
|
36 |
class CodeChunker(Chunker):
|
37 |
def __init__(self, file_extension, encoding_name="gpt-4"):
|
38 |
super().__init__(encoding_name)
|
@@ -60,15 +60,16 @@ class CodeChunker(Chunker):
|
|
60 |
if highest_comment_line: # If a highest comment line exists, add it
|
61 |
adjusted_breakpoints.append(highest_comment_line)
|
62 |
else:
|
63 |
-
adjusted_breakpoints.append(
|
|
|
64 |
|
65 |
breakpoints = sorted(set(adjusted_breakpoints)) # Ensure breakpoints are unique and sorted
|
66 |
-
|
67 |
while i < len(lines):
|
68 |
line = lines[i]
|
69 |
new_token_count = count_tokens(line, self.encoding_name)
|
70 |
if token_count + new_token_count > token_limit:
|
71 |
-
|
72 |
# Set the stop line to the last breakpoint before the current line
|
73 |
if i in breakpoints:
|
74 |
stop_line = i
|
@@ -79,20 +80,20 @@ class CodeChunker(Chunker):
|
|
79 |
if stop_line == start_line and i not in breakpoints:
|
80 |
token_count += new_token_count
|
81 |
i += 1
|
82 |
-
|
83 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
84 |
elif stop_line == start_line and i == stop_line:
|
85 |
token_count += new_token_count
|
86 |
i += 1
|
87 |
-
|
88 |
-
|
89 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
90 |
elif stop_line == start_line and i in breakpoints:
|
91 |
current_chunk = "\n".join(lines[start_line:stop_line])
|
92 |
if current_chunk.strip(): # If the current chunk is not just whitespace
|
93 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
94 |
chunk_number += 1
|
95 |
-
|
96 |
token_count = 0
|
97 |
start_line = i
|
98 |
i += 1
|
@@ -103,7 +104,7 @@ class CodeChunker(Chunker):
|
|
103 |
if current_chunk.strip():
|
104 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
105 |
chunk_number += 1
|
106 |
-
|
107 |
i = stop_line
|
108 |
token_count = 0
|
109 |
start_line = stop_line
|
@@ -116,9 +117,8 @@ class CodeChunker(Chunker):
|
|
116 |
current_chunk_code = "\n".join(lines[start_line:])
|
117 |
if current_chunk_code.strip(): # Checks if the chunk is not just whitespace
|
118 |
chunks[chunk_number] = current_chunk_code # Using chunk_number as key
|
119 |
-
|
120 |
return chunks
|
121 |
|
122 |
def get_chunk(self, chunked_codebase, chunk_number):
|
123 |
return chunked_codebase[chunk_number]
|
124 |
-
|
|
|
1 |
from abc import ABC, abstractmethod
|
2 |
from CodeParser import CodeParser
|
3 |
+
from utils import count_tokens
|
|
|
4 |
|
5 |
|
6 |
class Chunker(ABC):
|
|
|
19 |
def print_chunks(chunks):
|
20 |
for chunk_number, chunk_code in chunks.items():
|
21 |
print(f"Chunk {chunk_number}:")
|
22 |
+
print("=" * 40)
|
23 |
print(chunk_code)
|
24 |
+
print("=" * 40)
|
25 |
|
26 |
@staticmethod
|
27 |
def consolidate_chunks_into_file(chunks):
|
28 |
return "\n".join(chunks.values())
|
29 |
+
|
30 |
@staticmethod
|
31 |
def count_lines(consolidated_chunks):
|
32 |
lines = consolidated_chunks.split("\n")
|
33 |
return len(lines)
|
34 |
|
35 |
+
|
36 |
class CodeChunker(Chunker):
|
37 |
def __init__(self, file_extension, encoding_name="gpt-4"):
|
38 |
super().__init__(encoding_name)
|
|
|
60 |
if highest_comment_line: # If a highest comment line exists, add it
|
61 |
adjusted_breakpoints.append(highest_comment_line)
|
62 |
else:
|
63 |
+
adjusted_breakpoints.append(
|
64 |
+
bp) # If no comments were found before the breakpoint, add the original breakpoint
|
65 |
|
66 |
breakpoints = sorted(set(adjusted_breakpoints)) # Ensure breakpoints are unique and sorted
|
67 |
+
|
68 |
while i < len(lines):
|
69 |
line = lines[i]
|
70 |
new_token_count = count_tokens(line, self.encoding_name)
|
71 |
if token_count + new_token_count > token_limit:
|
72 |
+
|
73 |
# Set the stop line to the last breakpoint before the current line
|
74 |
if i in breakpoints:
|
75 |
stop_line = i
|
|
|
80 |
if stop_line == start_line and i not in breakpoints:
|
81 |
token_count += new_token_count
|
82 |
i += 1
|
83 |
+
|
84 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
85 |
elif stop_line == start_line and i == stop_line:
|
86 |
token_count += new_token_count
|
87 |
i += 1
|
88 |
+
|
89 |
+
|
90 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
91 |
elif stop_line == start_line and i in breakpoints:
|
92 |
current_chunk = "\n".join(lines[start_line:stop_line])
|
93 |
if current_chunk.strip(): # If the current chunk is not just whitespace
|
94 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
95 |
chunk_number += 1
|
96 |
+
|
97 |
token_count = 0
|
98 |
start_line = i
|
99 |
i += 1
|
|
|
104 |
if current_chunk.strip():
|
105 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
106 |
chunk_number += 1
|
107 |
+
|
108 |
i = stop_line
|
109 |
token_count = 0
|
110 |
start_line = stop_line
|
|
|
117 |
current_chunk_code = "\n".join(lines[start_line:])
|
118 |
if current_chunk_code.strip(): # Checks if the chunk is not just whitespace
|
119 |
chunks[chunk_number] = current_chunk_code # Using chunk_number as key
|
120 |
+
|
121 |
return chunks
|
122 |
|
123 |
def get_chunk(self, chunked_codebase, chunk_number):
|
124 |
return chunked_codebase[chunk_number]
|
|
app.py
CHANGED
@@ -1,10 +1,28 @@
|
|
1 |
import streamlit as st
|
2 |
from utils import load_json, count_tokens
|
3 |
import json
|
|
|
4 |
|
5 |
# Set up the Streamlit page configuration
|
6 |
st.set_page_config(page_title="Cintra Code Chunker", layout="wide")
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
def main():
|
9 |
# Streamlit widgets for file selection
|
10 |
st.title("Cintra Code Chunker")
|
@@ -38,4 +56,4 @@ def main():
|
|
38 |
|
39 |
|
40 |
if __name__ == "__main__":
|
41 |
-
main()
|
|
|
1 |
import streamlit as st
|
2 |
from utils import load_json, count_tokens
|
3 |
import json
|
4 |
+
import os
|
5 |
|
6 |
# Set up the Streamlit page configuration
|
7 |
st.set_page_config(page_title="Cintra Code Chunker", layout="wide")
|
8 |
|
9 |
+
# Slider to select a value
|
10 |
+
x = st.slider("Select a value")
|
11 |
+
st.write(x, "squared is", x * x)
|
12 |
+
|
13 |
+
|
14 |
+
code_files_directory = "example_code_files"
|
15 |
+
code_files = os.listdir(code_files_directory)
|
16 |
+
|
17 |
+
# Dropdown menu for the user to select a code file
|
18 |
+
selected_file = st.selectbox("Select a code file", code_files)
|
19 |
+
|
20 |
+
file_path = os.path.join(code_files_directory, selected_file)
|
21 |
+
with open(file_path, "r") as file:
|
22 |
+
code_content = file.read()
|
23 |
+
st.code(code_content, language="python")
|
24 |
+
|
25 |
+
|
26 |
def main():
|
27 |
# Streamlit widgets for file selection
|
28 |
st.title("Cintra Code Chunker")
|
|
|
56 |
|
57 |
|
58 |
if __name__ == "__main__":
|
59 |
+
main()
|
requirements.txt
CHANGED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
altair==5.3.0
|
2 |
+
attrs==23.2.0
|
3 |
+
blinker==1.7.0
|
4 |
+
cachetools==5.3.3
|
5 |
+
certifi==2024.2.2
|
6 |
+
charset-normalizer==3.3.2
|
7 |
+
click==8.1.7
|
8 |
+
colorama==0.4.6
|
9 |
+
gitdb==4.0.11
|
10 |
+
GitPython==3.1.43
|
11 |
+
idna==3.6
|
12 |
+
Jinja2==3.1.3
|
13 |
+
jsonschema==4.21.1
|
14 |
+
jsonschema-specifications==2023.12.1
|
15 |
+
markdown-it-py==3.0.0
|
16 |
+
MarkupSafe==2.1.5
|
17 |
+
mdurl==0.1.2
|
18 |
+
numpy==1.26.4
|
19 |
+
packaging==24.0
|
20 |
+
pandas==2.2.1
|
21 |
+
pillow==10.3.0
|
22 |
+
protobuf==4.25.3
|
23 |
+
pyarrow==15.0.2
|
24 |
+
pydeck==0.8.1b0
|
25 |
+
Pygments==2.17.2
|
26 |
+
python-dateutil==2.9.0.post0
|
27 |
+
pytz==2024.1
|
28 |
+
referencing==0.34.0
|
29 |
+
requests==2.31.0
|
30 |
+
rich==13.7.1
|
31 |
+
rpds-py==0.18.0
|
32 |
+
six==1.16.0
|
33 |
+
smmap==5.0.1
|
34 |
+
streamlit==1.33.0
|
35 |
+
tenacity==8.2.3
|
36 |
+
regex==2023.12.25
|
37 |
+
tiktoken==0.6.0
|
38 |
+
tree-sitter==0.21.3
|
39 |
+
toml==0.10.2
|
40 |
+
toolz==0.12.1
|
41 |
+
tornado==6.4
|
42 |
+
typing_extensions==4.11.0
|
43 |
+
tzdata==2024.1
|
44 |
+
urllib3==2.2.1
|
45 |
+
watchdog==4.0.0
|