DrishtiSharma committed on
Commit
59727ea
·
verified ·
1 Parent(s): 10150e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -133
app.py CHANGED
@@ -22,7 +22,6 @@ from langchain_community.tools.sql_database.tool import (
22
  )
23
  from langchain_community.utilities.sql_database import SQLDatabase
24
  from datasets import load_dataset
25
- from difflib import get_close_matches
26
  import tempfile
27
 
28
  st.title("SQL-RAG Using CrewAI 🚀")
@@ -177,138 +176,6 @@ def escape_markdown(text):
177
  escape_chars = r"(\*|_|`|~)"
178
  return re.sub(escape_chars, r"\\\1", text)
179
 
180
-
181
- # Synonym mapping for flexible query understanding
182
- COLUMN_SYNONYMS = {
183
- "job_title": ["job title", "job role", "role", "designation", "position", "job responsibility", "occupation"],
184
- "experience_level": ["experience level", "seniority", "experience", "career stage", "years of experience"],
185
- "employment_type": ["employment type", "job type", "contract type", "employment status", "type of employment"],
186
- "salary_in_usd": ["salary", "income", "earnings", "pay", "wage", "compensation", "amount", "paid"],
187
- "remote_ratio": ["remote work", "work from home", "remote ratio", "remote", "telecommute", "wfh"],
188
- "company_size": ["company size", "organization size", "business size", "firm size", "big", "small"],
189
- #"employee_residence": ["country", "residence", "location", "employee location"],
190
- "company_location": ["company location", "office location", "company country", "headquarters", "location", "located", "area"],
191
- }
192
-
193
-
194
- # Fuzzy matcher for mapping query terms to dataset columns
195
- def fuzzy_match_columns(query):
196
- query = query.lower()
197
- all_synonyms = {synonym: col for col, synonyms in COLUMN_SYNONYMS.items() for synonym in synonyms}
198
- words = query.replace("and", "").replace("vs", "").replace("by", "").split()
199
-
200
- matched_columns = []
201
- for word in words:
202
- matches = get_close_matches(word, all_synonyms.keys(), n=1, cutoff=0.6)
203
- matched_columns.extend([all_synonyms[match] for match in matches])
204
-
205
- return list(dict.fromkeys(matched_columns))
206
-
207
- # Ask LLM to suggest relevant columns if fuzzy matching fails
208
- def ask_llm_for_columns(query, llm, df):
209
- columns = ', '.join(df.columns)
210
- prompt = f"""
211
- Analyze this user query and suggest the most relevant dataset columns for visualization.
212
-
213
- Query: "{query}"
214
-
215
- Available Columns: {columns}
216
-
217
- Respond in this JSON format:
218
- {{
219
- "x_axis": "column_name",
220
- "y_axis": "column_name",
221
- "group_by": "optional_column_name"
222
- }}
223
- """
224
-
225
- response = llm.generate(prompt)
226
- try:
227
- suggestion = json.loads(response)
228
- return suggestion
229
- except json.JSONDecodeError:
230
- st.error("⚠️ Failed to interpret AI response. Please refine your query.")
231
- return None
232
-
233
- # Add min, max, and average salary annotations to the chart
234
- def add_stats_to_figure(fig, df, y_axis):
235
- min_salary = df[y_axis].min()
236
- max_salary = df[y_axis].max()
237
- avg_salary = df[y_axis].mean()
238
-
239
- fig.add_annotation(
240
- text=f"Min: ${min_salary:,.2f} | Max: ${max_salary:,.2f} | Avg: ${avg_salary:,.2f}",
241
- xref="paper", yref="paper",
242
- x=0.5, y=1.1,
243
- showarrow=False,
244
- font=dict(size=12, color="black"),
245
- bgcolor="rgba(255, 255, 255, 0.7)"
246
- )
247
- return fig
248
-
249
- # Unified Visualization Generator with Fuzzy Matching and LLM Fallback
250
- def generate_visual_from_query(query, df, llm=None):
251
- try:
252
- # Step 1: Attempt Fuzzy Matching
253
- matched_columns = fuzzy_match_columns(query)
254
-
255
- # Step 2: Fallback to LLM if no columns are matched
256
- if not matched_columns and llm:
257
- st.info("🤖 No match found. Asking AI for suggestions...")
258
- suggestion = ask_llm_for_columns(query, llm, df)
259
- if suggestion:
260
- matched_columns = [suggestion.get("x_axis"), suggestion.get("group_by")]
261
-
262
- # Step 3: Process Matched Columns
263
- if len(matched_columns) >= 2:
264
- x_axis, group_by = matched_columns[0], matched_columns[1]
265
- elif len(matched_columns) == 1:
266
- x_axis, group_by = matched_columns[0], None
267
- else:
268
- st.warning("❓ No matching columns found. Try rephrasing your query.")
269
- return None
270
-
271
- # Step 4: Visualization Generation
272
-
273
- # Distribution Plot
274
- if "distribution" in query:
275
- fig = px.box(df, x=x_axis, y="salary_in_usd", color=group_by,
276
- title=f"Salary Distribution by {x_axis.replace('_', ' ').title()}"
277
- + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
278
- return add_stats_to_figure(fig, df, "salary_in_usd")
279
-
280
- # Average Salary Plot
281
- elif "average" in query or "mean" in query:
282
- grouped_df = df.groupby([x_axis] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
283
- fig = px.bar(grouped_df, x=x_axis, y="salary_in_usd", color=group_by,
284
- title=f"Average Salary by {x_axis.replace('_', ' ').title()}"
285
- + (f" and {group_by.replace('_', ' ').title()}" if group_by else ""))
286
- return add_stats_to_figure(fig, df, "salary_in_usd")
287
-
288
- # Salary Trends Over Time
289
- elif "trend" in query and "work_year" in df.columns:
290
- grouped_df = df.groupby(["work_year", x_axis])["salary_in_usd"].mean().reset_index()
291
- fig = px.line(grouped_df, x="work_year", y="salary_in_usd", color=x_axis,
292
- title=f"Salary Trend Over Years by {x_axis.replace('_', ' ').title()}")
293
- return add_stats_to_figure(fig, df, "salary_in_usd")
294
-
295
- # Remote Work Impact
296
- elif "remote" in query:
297
- grouped_df = df.groupby(["remote_ratio"] + ([group_by] if group_by else []))["salary_in_usd"].mean().reset_index()
298
- fig = px.bar(grouped_df, x="remote_ratio", y="salary_in_usd", color=group_by,
299
- title="Remote Work Impact on Salary")
300
- return add_stats_to_figure(fig, df, "salary_in_usd")
301
-
302
- # No Specific Match
303
- else:
304
- st.warning("⚠️ No suitable visualization to display!")
305
- return None
306
-
307
- except Exception as e:
308
- st.error(f"Error generating visualization: {e}")
309
- return None
310
-
311
-
312
  # SQL-RAG Analysis
313
  if st.session_state.df is not None:
314
  temp_dir = tempfile.TemporaryDirectory()
@@ -396,6 +263,8 @@ if st.session_state.df is not None:
396
  context=[analyze_data],
397
  )
398
 
 
 
399
  # Separate Crews for report and conclusion
400
  crew_report = Crew(
401
  agents=[sql_dev, data_analyst, report_writer],
@@ -487,3 +356,4 @@ else:
487
  with st.sidebar:
488
  st.header("📚 Reference:")
489
  st.markdown("[SQL Agents w CrewAI & Llama 3 - Plaban Nayak](https://github.com/plaban1981/Agents/blob/main/SQL_Agents_with_CrewAI_and_Llama_3.ipynb)")
 
 
22
  )
23
  from langchain_community.utilities.sql_database import SQLDatabase
24
  from datasets import load_dataset
 
25
  import tempfile
26
 
27
  st.title("SQL-RAG Using CrewAI 🚀")
 
176
  escape_chars = r"(\*|_|`|~)"
177
  return re.sub(escape_chars, r"\\\1", text)
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  # SQL-RAG Analysis
180
  if st.session_state.df is not None:
181
  temp_dir = tempfile.TemporaryDirectory()
 
263
  context=[analyze_data],
264
  )
265
 
266
+
267
+
268
  # Separate Crews for report and conclusion
269
  crew_report = Crew(
270
  agents=[sql_dev, data_analyst, report_writer],
 
356
  with st.sidebar:
357
  st.header("📚 Reference:")
358
  st.markdown("[SQL Agents w CrewAI & Llama 3 - Plaban Nayak](https://github.com/plaban1981/Agents/blob/main/SQL_Agents_with_CrewAI_and_Llama_3.ipynb)")
359
+