Commit 68f18b5 by buzzCraft · 1 parent: 9baf55e

init
Files changed:
- .env_demo +4 -0
- README.md +31 -0
- data/Dataset/augmented.csv +7 -0
- data/Dataset/augmented_leauges.csv +21 -0
- main.py +19 -0
- requirements.txt +11 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/extractor.cpython-311.pyc +0 -0
- src/__pycache__/sql_chain.cpython-311.pyc +0 -0
- src/conf/extractor_prompt.json +4 -0
- src/conf/schema.json +61 -0
- src/conf/sqls.json +105 -0
- src/database/database.py +445 -0
- src/database/readdata.ipynb +0 -0
- src/extractor.py +558 -0
- src/sql_chain.py +160 -0
.env_demo
ADDED
@@ -0,0 +1,4 @@
OPENAI_API_KEY=API_KEY_HERE
LANGSMITH = False
LANGSMITH_API_KEY=API_KEY_HERE  # NOT NEEDED IF LANGSMITH IS FALSE
```
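These values are read at runtime with python-dotenv. A minimal sketch of how they are consumed, mirroring the top of src/extractor.py further down (the exact LangSmith variables follow that file):

```python
import os

from dotenv import load_dotenv  # python-dotenv is listed in requirements.txt

load_dotenv(".env")  # after renaming .env_demo to .env

openai_key = os.getenv("OPENAI_API_KEY")

# Values from .env are strings, so "False" must be compared explicitly
if os.getenv("LANGSMITH", "False").lower() == "true":
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGSMITH_API_KEY", "")
```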
README.md
ADDED
@@ -0,0 +1,31 @@
# SoccerRAG: Multimodal Soccer Information Retrieval via Natural Queries

## Setup
````bash
pip install -r requirements.txt
````
Rename .env_demo to .env and fill in the required fields.

## Required data
The data required to run the code is not included in this repository.
The data can be downloaded from [SoccerNet](https://www.soccer-net.org/data).
Files needed are:
* Labels-v2.json
* Labels-captions.json

## Running the code
To run the code, execute the following command:
````bash
python main.py
````
The code will prompt you to enter a natural language query.

## Results
..

## Acknowledgements
..

## Citation
..
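The interactive loop in main.py (shown below) wraps a two-step pipeline: the extractor cleans the question and resolves entity names against the database, then the SQL agent answers it. A minimal sketch of driving the same pipeline programmatically; the question text is only an illustration:

```python
from src.extractor import create_extractor
from src.sql_chain import create_agent

ex = create_extractor()
ag = create_agent(llm_model="gpt-3.5-turbo-0125", verbose=False)

# Entity names in the question are resolved against the database first,
# then the cleaned prompt is handed to the SQL agent.
cleaned = ex.clean("How many goals did Manchester United score in 2016-2017?")
answer, _ = ag.ask(cleaned)
print(answer["output"])
```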
data/Dataset/augmented.csv
ADDED
@@ -0,0 +1,7 @@
name,augmented_name
Manchester United, ManU
Manchester United, Man U
Manchester United, ManUnt
Manchester United, Manchester U
Manchester United, Manchester Unt
Manchester United, Man United
data/Dataset/augmented_leauges.csv
ADDED
@@ -0,0 +1,21 @@
name,augmented_name
england_epl, epl
england_epl, premier league
england_epl, english premier league
england_epl, english premier
europe_uefa-champions-league, uefa champions league
europe_uefa-champions-league, champions league
europe_uefa-champions-league, cl
europe_uefa-champions-league, ucl
france_ligue-1, ligue 1
france_ligue-1, ligue1
germany_bundesliga, bundesliga
germany_bundesliga, bundes liga
germany_bundesliga, bundes
italy_serie-a, serie a
italy_serie-a, seriea
italy_serie-a, serie-a
spain_laliga, la liga
spain_laliga, laliga
spain_laliga, la-liga
main.py
ADDED
@@ -0,0 +1,19 @@
from src.extractor import create_extractor
from src.sql_chain import create_agent

ex = create_extractor()
ag = create_agent(llm_model="gpt-3.5-turbo-0125", verbose=False)
# ag = create_agent(llm_model="gpt-4-0125-preview")


def query(prompt):
    clean = ex.clean(prompt)
    return ag.ask(clean)


if __name__ == "__main__":
    while True:
        inp = input("Enter a query: ")
        if inp == "exit":
            break
        ans, _ = query(inp)
        print(ans["output"])
    exit(0)
requirements.txt
ADDED
@@ -0,0 +1,11 @@
openai
langchainhub
langchain == 0.1.4
langchain_openai
langchain_experimental
sqlalchemy
python-dotenv
chromadb
python-Levenshtein
rapidfuzz
thefuzz
src/__init__.py
ADDED
File without changes

src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (180 Bytes).

src/__pycache__/extractor.cpython-311.pyc
ADDED
Binary file (26.1 kB).

src/__pycache__/sql_chain.cpython-311.pyc
ADDED
Binary file (9.27 kB).
src/conf/extractor_prompt.json
ADDED
@@ -0,0 +1,4 @@
{
    "extract_prompt": "Extract and save the relevant entities mentioned in the following passage together with their properties.\\n\\n Only extract the properties mentioned in the 'information_extraction' function.\\n\\n The questions are football related. game_event can be things like yellow cards, goals, assists etc.\\n\\n If a property is not present and is not required in the function parameters, do not include it in the output.\\n\\n Passage:\\n {input}\\n "
}
src/conf/schema.json
ADDED
@@ -0,0 +1,61 @@
{
  "properties": {
    "person_name": {
      "type": "array",
      "items": {
        "type": "string",
        "db_table": "players",
        "db_column": "name",
        "pk_column": "hash",
        "numeric": false
      }
    },
    "team_name": {
      "type": "array",
      "items": {
        "type": "string",
        "db_table": "teams",
        "db_column": "name",
        "pk_column": "id",
        "numeric": false,
        "augmented_table": "augmented_teams",
        "augmented_column": "augmented_name",
        "augmented_fk": "team_id"
      }
    },
    "year_season": {
      "type": "array",
      "items": {
        "type": "string",
        "db_table": "games",
        "db_column": "season",
        "pk_column": null,
        "numeric": true
      }
    },
    "in_game_event": {
      "type": "array",
      "items": {
        "type": "string",
        "db_table": "events",
        "db_column": "label",
        "pk_column": null,
        "numeric": false
      }
    },
    "league": {
      "type": "array",
      "items": {
        "type": "string",
        "db_table": "leagues",
        "db_column": "name",
        "pk_column": "id",
        "numeric": false,
        "augmented_table": "augmented_leagues",
        "augmented_column": "augmented_name",
        "augmented_fk": "league_id"
      }
    }
  },
  "required": []
}
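Each property above is consumed by the Retriever class in src/extractor.py (added further down): the "items" block tells it which table and column to search and, when present, which augmented alias table to consult first. A minimal sketch of that wiring, assuming a populated data/games.db and a valid .env in the working directory:

```python
import json

from langchain_community.utilities import SQLDatabase

from src.extractor import Retriever  # importing src.extractor also loads .env

db = SQLDatabase.from_uri("sqlite:///data/games.db")

with open("src/conf/schema.json") as f:
    schema = json.load(f)

# One Retriever per property, built from the "items" sub-dict,
# mirroring setup_retrievers() in src/extractor.py
team_retriever = Retriever(db=db, config=schema["properties"]["team_name"]["items"])

# Fuzzy-match a user-supplied alias against the teams.name column
print(team_retriever.find_close_matches("Man U", n=3, method="fuzzy"))
```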
src/conf/sqls.json
ADDED
@@ -0,0 +1,105 @@
[
  {
    "input": "List all teams",
    "query": "SELECT * FROM teams;"
  },
  {
    "input": "Find a player by name",
    "query": "SELECT * FROM players WHERE name = 'name';"
  },
  {
    "input": "Select the names of teams and calculate their total home and away goals in the 2016-2017 season. Count the total matches played and calculate the average goals per match. Order the teams by their total goals scored and limit the results to the top three.",
    "query": "SELECT teams.name, SUM(CASE WHEN games.home_team_id = teams.id THEN games.goal_home ELSE 0 END) AS home_goals, SUM(CASE WHEN games.away_team_id = teams.id THEN games.goal_away ELSE 0 END) AS away_goals, COUNT(*) AS matches_played, (SUM(CASE WHEN games.home_team_id = teams.id THEN games.goal_home ELSE 0 END) + SUM(CASE WHEN games.away_team_id = teams.id THEN games.goal_away ELSE 0 END)) * 1.0 / COUNT(*) AS avg_goals_per_match FROM games INNER JOIN teams ON teams.id = games.home_team_id OR teams.id = games.away_team_id WHERE games.season = '2016-2017' GROUP BY teams.name ORDER BY (home_goals + away_goals) DESC LIMIT 3;"
  },
  {
    "input": "Retrieve the name and country of a player identified by a specific hash value.",
    "query": "SELECT players.name, players.country FROM players WHERE players.hash = 'hash';"
  },
  {
    "input": "Get information about what happened in a time period",
    "query": "SELECT event_time_start, event_time_end, period, description FROM Commentary WHERE game_id = game_id AND period = period AND ABS(event_time_start - start_time) <= duration;"
  },
  {
    "input": "For a game with ID X, list the home team's name, players' names (with a captain indicator), and shirt numbers.",
    "query": "SELECT t.name AS team_name, p.name || CASE WHEN l.captain THEN ' (C)' ELSE '' END AS player_name, l.shirt_number FROM games g JOIN teams t ON g.home_team_id = t.id JOIN game_lineup l ON t.id = l.team_id AND l.game_id = g.id JOIN players p ON l.player_id = p.hash WHERE g.id = X;"
  },
  {
    "input": "Who was the home team, and away team in game X?",
    "query": "SELECT home_team.name AS home_team, away_team.name AS away_team FROM games JOIN teams AS home_team ON games.home_team_id = home_team.id JOIN teams AS away_team ON games.away_team_id = away_team.id WHERE games.id = X;"
  },
  {
    "input": "For game X, list all Shots on targets and goals, for each team (using their name not id) for each period",
    "query": "SELECT t.name AS team_name, a.period, SUM(CASE WHEN a.label = 'Shots on target' THEN 1 ELSE 0 END) AS shots_on_target, SUM(CASE WHEN a.label = 'Goal' THEN 1 ELSE 0 END) AS goals FROM annotations a JOIN teams t ON a.team_id = t.id WHERE a.game_id = X AND (a.label = 'Shots on target' OR a.label = 'Goal') GROUP BY t.name, a.period ORDER BY t.name, a.period;"
  },
  {
    "input": "How many offsides were caused by the away team in game X, also get the time of the event",
    "query": "SELECT a.game_id, a.label, a.position, a.period FROM annotations a JOIN games g ON a.game_id = g.id WHERE a.game_id = X AND a.label = 'Offside' AND a.team_id = g.away_team_id;"
  },
  {
    "input": "all goals scored by <team> in <season>",
    "query": "SELECT t.name AS TeamName, g.season, SUM(CASE WHEN g.home_team_id = t.id THEN g.goal_home ELSE 0 END + CASE WHEN g.away_team_id = t.id THEN g.goal_away ELSE 0 END) AS TotalGoals FROM games g JOIN teams t ON g.home_team_id = t.id OR g.away_team_id = t.id WHERE t.name = '<team>' AND g.season = '<season>' GROUP BY t.name, g.season;"
  },
  {
    "input": "All games played by <team> in <season> in <league>",
    "query": "SELECT g.id, g.date, g.season, l.name AS LeagueName, ht.name AS HomeTeam, at.name AS AwayTeam, g.score FROM games g JOIN teams ht ON g.home_team_id = ht.id JOIN teams at ON g.away_team_id = at.id JOIN leagues l ON g.league_id = l.id WHERE (ht.name = '<team>' OR at.name = '<team>') AND l.name = '<league>' AND g.season = '<season>';"
  },
  {
    "input": "List all teams that played against <team> in season <season> and league <league>",
    "query": "SELECT DISTINCT CASE WHEN ht.name = '<team>' THEN at.name ELSE ht.name END AS OpponentTeam FROM games g JOIN teams ht ON g.home_team_id = ht.id JOIN teams at ON g.away_team_id = at.id JOIN leagues l ON g.league_id = l.id WHERE (ht.name = '<team>' OR at.name = '<team>') AND l.name = '<league>' AND g.season = '<season>' ORDER BY OpponentTeam;"
  },
  {
    "input": "Get home and away stats for <team> in <season>",
    "query": "WITH home_games AS (SELECT g.id, g.season, g.home_team_id AS team_id, CASE WHEN g.goal_home > g.goal_away THEN 1 ELSE 0 END AS won, CASE WHEN g.goal_home = g.goal_away THEN 1 ELSE 0 END AS draw, CASE WHEN g.goal_home < g.goal_away THEN 1 ELSE 0 END AS lost FROM games g JOIN teams t ON g.home_team_id = t.id WHERE t.name = '<team>' AND g.season = '<season>'), away_games AS (SELECT g.id, g.season, g.away_team_id AS team_id, CASE WHEN g.goal_away > g.goal_home THEN 1 ELSE 0 END AS won, CASE WHEN g.goal_away = g.goal_home THEN 1 ELSE 0 END AS draw, CASE WHEN g.goal_away < g.goal_home THEN 1 ELSE 0 END AS lost FROM games g JOIN teams t ON g.away_team_id = t.id WHERE t.name = '<team>' AND g.season = '<season>'), home_stats AS (SELECT COUNT(*) AS total_home_games, SUM(won) AS home_wins, SUM(draw) AS home_draws, SUM(lost) AS home_losses FROM home_games), away_stats AS (SELECT COUNT(*) AS total_away_games, SUM(won) AS away_wins, SUM(draw) AS away_draws, SUM(lost) AS away_losses FROM away_games) SELECT hs.total_home_games, hs.home_wins, hs.home_draws, hs.home_losses, as_stats.total_away_games, as_stats.away_wins, as_stats.away_draws, as_stats.away_losses FROM home_stats hs, away_stats as_stats;"
  },
  {
    "input": "How many goals did <player> score in <season> in <league>?",
    "query": "SELECT COUNT(*) AS goal_count FROM player_events pe JOIN players p ON pe.player_id = p.hash JOIN games g ON pe.game_id = g.id JOIN leagues l ON g.league_id = l.id JOIN player_event_labels pel ON pe.type = pel.id WHERE p.name = <player> AND g.season = <season> AND l.name = <league> AND pel.label = 'Goal';"
  },
  {
    "input": "How many goals did <player> score in <season>?",
    "query": "SELECT COUNT(*) AS goal_count FROM player_events pe JOIN players p ON pe.player_id = p.hash JOIN games g ON pe.game_id = g.id JOIN player_event_labels pel ON pe.type = pel.id WHERE p.name = <player> AND g.season = <season> AND pel.label = 'Goal';"
  },
  {
    "input": "List all teams that played against <team> in season <season>",
    "query": "SELECT DISTINCT opponent.name AS opponent_name FROM games JOIN teams AS opponent ON (opponent.id = games.home_team_id OR opponent.id = games.away_team_id) JOIN teams AS specified_team ON (specified_team.id = games.home_team_id OR specified_team.id = games.away_team_id) WHERE (games.home_team_id = (SELECT id FROM teams WHERE name = '<team>') OR games.away_team_id = (SELECT id FROM teams WHERE name = '<team>')) AND games.season = '<season>' AND opponent.name != '<team>'"
  },
  {
    "input": "List all teams in <league> in <season>",
    "query": "SELECT DISTINCT team.name FROM games JOIN teams team ON team.id = games.home_team_id OR team.id = games.away_team_id WHERE games.league_id = (SELECT id FROM leagues WHERE name = '<league_name>') AND games.season = '<season>'"
  },
  {
    "input": "List all games in <league> in <season> with <event> in first half",
    "query": "SELECT ht.name AS home_team, at.name AS away_team, g.score, g.date FROM games g JOIN leagues l ON g.league_id = l.id JOIN events e ON g.id = e.game_id AND g.home_team_id = e.team_id JOIN teams ht ON g.home_team_id = ht.id JOIN teams at ON g.away_team_id = at.id WHERE l.name = '<league>' AND g.season = '<season>' AND e.period = 1 AND e.label = '<event>' GROUP BY g.id;"
  },
  {
    "input": "List all games in <league> in <season> with <event>, and include the number of times the event occurred",
    "query": "SELECT ht.name AS home_team, at.name AS away_team, g.score, g.date, COUNT(e.id) AS event_count FROM games g JOIN leagues l ON g.league_id = l.id JOIN events e ON g.id = e.game_id AND g.home_team_id = e.team_id JOIN teams ht ON g.home_team_id = ht.id JOIN teams at ON g.away_team_id = at.id WHERE l.name = '<league>' AND g.season = '<season>' AND e.label = '<event>' GROUP BY g.id;"
  },
  {
    "input": "What teams and in what season did <player> play in?",
    "query": "SELECT DISTINCT p.name AS player_name, t.name AS team_name, g.season, l.name AS league_name FROM game_lineup gl JOIN players p ON gl.player_id = p.hash JOIN teams t ON gl.team_id = t.id JOIN games g ON gl.game_id = g.id JOIN leagues l ON g.league_id = l.id WHERE p.name = '<player>' ORDER BY p.name, t.name, g.season, l.name;"
  },
  {
    "input": "List all players in <team> in <season>",
    "query": "SELECT DISTINCT p.name AS player_name FROM game_lineup gl JOIN players p ON gl.player_id = p.hash JOIN teams t ON gl.team_id = t.id JOIN games g ON gl.game_id = g.id WHERE t.name = '<team>' AND g.season = '<season>' ORDER BY p.name;"
  },
  {
    "input": "List all teams a player has played for",
    "query": "SELECT DISTINCT t.name AS team_name FROM game_lineup gl JOIN players p ON gl.player_id = p.hash JOIN teams t ON gl.team_id = t.id WHERE p.name = '<player>' ORDER BY t.name;"
  },
  {
    "input": "List all yellow and red cards for game <game_id>, sorted by time",
    "query": "SELECT p.name AS player_name, pel.label AS card_type, pe.time AS event_time FROM player_events pe JOIN players p ON pe.player_id = p.hash JOIN player_event_labels pel ON pe.type = pel.id WHERE pe.game_id = <game_id> AND (pel.label = 'Yellow card' OR pel.label = 'Red card') ORDER BY CAST(pe.time AS UNSIGNED) ASC;"
  },
  {
    "input": "What player had the first <event> in league <league> in season <season>?",
    "query": "SELECT p.name AS player_name, pe.game_id, pe.time AS event_time FROM player_events pe JOIN players p ON pe.player_id = p.hash JOIN (SELECT g.id FROM games g JOIN leagues l ON g.league_id = l.id WHERE g.season = '<season>' AND l.id = <league_id> ORDER BY g.id LIMIT 1) AS first_game ON pe.game_id = first_game.id JOIN player_event_labels pel ON pe.type = pel.id WHERE pel.label = <event> ORDER BY CAST(pe.time AS UNSIGNED) ASC LIMIT 1;"
  },
  {
    "input": "How many times did <player> get substituted in <season>?",
    "query": "SELECT COUNT(*) AS substitution_count FROM player_events pe JOIN players p ON pe.player_id = p.hash JOIN games g ON pe.game_id = g.id WHERE p.hash = <player_hash> AND g.season = <season> AND (pe.type = 6 or pe.type = 7)"
  }
]
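src/sql_chain.py is not shown in this excerpt, so how these input/query pairs are actually consumed is not visible here. A plausible, hypothetical sketch of using them as semantically selected few-shot examples; the example selector, embeddings, and Chroma vector store are assumptions (chromadb does appear in requirements.txt), not the repository's confirmed wiring:

```python
import json

from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

with open("src/conf/sqls.json") as f:
    examples = json.load(f)

# Pick the k most similar input/query pairs for a new question
selector = SemanticSimilarityExampleSelector.from_examples(
    examples,
    OpenAIEmbeddings(),
    Chroma,
    k=3,
    input_keys=["input"],
)
print(selector.select_examples({"input": "goals scored by Arsenal in 2016-2017"}))
```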
src/database/database.py
ADDED
@@ -0,0 +1,445 @@
from sqlalchemy import create_engine, Column, Integer, String, ForeignKey, Text, Float, Boolean, UniqueConstraint
from sqlalchemy.orm import declarative_base, sessionmaker
import pandas as pd
import os
import json

engine = create_engine('sqlite:///../../data/games.db', echo=False)
Base = declarative_base()


class Game(Base):
    __tablename__ = 'games'
    id = Column(Integer, primary_key=True)
    timestamp = Column(String)
    score = Column(String)
    goal_home = Column(Integer)
    goal_away = Column(Integer)
    round = Column(String)
    home_team_id = Column(Integer, ForeignKey('teams.id'))
    away_team_id = Column(Integer, ForeignKey('teams.id'))
    venue = Column(String)
    referee = Column(String)
    attendance = Column(String)
    date = Column(String)
    season = Column(String)
    league_id = Column(Integer, ForeignKey('leagues.id'))


class GameLineup(Base):
    __tablename__ = 'game_lineup'
    id = Column(Integer, primary_key=True)
    game_id = Column(Integer, ForeignKey('games.id'))
    team_id = Column(Integer, ForeignKey('teams.id'))
    player_id = Column(Integer, ForeignKey('players.hash'))
    shirt_number = Column(String)
    position = Column(String)
    starting = Column(Boolean)
    captain = Column(Boolean)
    coach = Column(Boolean)
    tactics = Column(String)
    # Add a unique constraint on game_id and player_id
    __table_args__ = (UniqueConstraint('game_id', 'player_id', name='uc_game_id_player_id'),)


class Team(Base):
    __tablename__ = 'teams'
    id = Column(Integer, primary_key=True)
    name = Column(String)


class Player(Base):
    __tablename__ = 'players'
    hash = Column(String, primary_key=True)
    name = Column(String)
    country = Column(String)


class Caption(Base):
    __tablename__ = 'captions'
    id = Column(Integer, primary_key=True)
    game_id = Column(Integer, ForeignKey('games.id'))
    game_time = Column(String)
    period = Column(Integer)
    label = Column(String)
    description = Column(Text)
    important = Column(Boolean)
    visibility = Column(Boolean)
    frame_stamp = Column(Integer)


class Commentary(Base):
    __tablename__ = 'commentary'
    id = Column(Integer, primary_key=True)
    game_id = Column(Integer, ForeignKey('games.id'))
    period = Column(Integer)
    event_time_start = Column(Float)
    event_time_end = Column(Float)
    description = Column(Text)


class League(Base):
    __tablename__ = 'leagues'
    id = Column(Integer, primary_key=True)
    name = Column(String)


class Event(Base):
    __tablename__ = 'events'
    id = Column(Integer, primary_key=True)
    game_id = Column(Integer, ForeignKey('games.id'))
    period = Column(Integer)
    # half = Column(Integer)
    game_time = Column(Integer)
    team_id = Column(Integer, ForeignKey('teams.id'))
    frame_stamp = Column(Integer)
    label = Column(String)
    visibility = Column(Boolean)


class Augmented_Team(Base):
    __tablename__ = 'augmented_teams'
    id = Column(Integer, primary_key=True)
    team_id = Column(Integer, ForeignKey('teams.id'))
    augmented_name = Column(String)


class Augmented_League(Base):
    __tablename__ = 'augmented_leagues'
    id = Column(Integer, primary_key=True)
    league_id = Column(Integer, ForeignKey('leagues.id'))
    augmented_name = Column(String)


class Player_Event_Label(Base):
    __tablename__ = 'player_event_labels'
    id = Column(Integer, primary_key=True)
    label = Column(String)


class Player_Event(Base):
    __tablename__ = 'player_events'
    id = Column(Integer, primary_key=True)
    game_id = Column(Integer, ForeignKey('games.id'))
    player_id = Column(Integer, ForeignKey('players.hash'))
    time = Column(String)  # Time in minutes of the game
    type = Column(Integer, ForeignKey('player_event_labels.id'))
    linked_player = Column(Integer, ForeignKey('players.hash'))  # If the event is linked to another player, for example a substitution


# Create Tables
Base.metadata.create_all(engine)

# Session setup
Session = sessionmaker(bind=engine)


def extract_time_from_player_event(time: str) -> str:
    # Extract the time from the string
    time = time.split("'")[0]  # Need to keep it str because of overtime eg. (45+2)
    return time


def get_or_create(session, model, **kwargs):
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return instance
    else:
        instance = model(**kwargs)
        session.add(instance)
        session.commit()
        return instance


def process_game_data(data, data2, league, season):
    session = Session()
    # Caption = d and v2 = d2
    home_team = data["gameHomeTeam"]
    away_team = data["gameAwayTeam"]
    score = data["score"]
    home_score = score[0]
    away_score = score[-1]
    round_ = data["round"]
    venue = data["venue"][0]
    referee = data.get("referee_found", None)
    referee = referee[0] if referee else data.get("referee", None)
    date = data["gameDate"]
    timestamp = data["timestamp"]
    attendance = data.get("attendance", [])
    attendance = attendance[0] if attendance else None

    home_team = get_or_create(session, Team, name=home_team)
    away_team = get_or_create(session, Team, name=away_team)
    # Check if the game already exists
    game = session.query(Game).filter_by(timestamp=timestamp, home_team_id=home_team.id).first()
    # Check if league exists
    league = get_or_create(session, League, name=league)
    if not game:
        game = Game(timestamp=timestamp, score=score, goal_home=home_score, goal_away=away_score, round=round_, home_team_id=home_team.id, away_team_id=away_team.id,
                    venue=venue, date=date, attendance=attendance, season=season, league_id=league.id, referee=referee)
        session.add(game)
        session.commit()

    teams = ["home", "away"]
    # Add lineup data
    for team in teams:
        if team == "home":
            team_id = home_team.id
        else:
            team_id = away_team.id
        team_lineup = data["lineup"][team]
        tactic = team_lineup["tactic"]

        for player_data in team_lineup["players"]:
            player_hash = player_data["hash"]
            name = player_data["long_name"]
            if " " not in name:  # Since some players are missing their first name, do this to help with the search
                name = "NULL " + name
            number = player_data["shirt_number"]
            captain = player_data["captain"] == "(C)"
            starting = player_data["starting"]
            country = player_data["country"]
            position = player_data["lineup"]
            facts = player_data.get("facts", None)  # Facts might be empty

            player = get_or_create(session, Player, hash=player_hash, name=name, country=country)
            game_lineup = GameLineup(game_id=game.id, team_id=team_id, player_id=player.hash,
                                     shirt_number=number, position=position, starting=starting, captain=captain, coach=False, tactics=tactic)
            if facts:
                for fact in facts:
                    type = fact["type"]
                    time = extract_time_from_player_event(fact["time"])
                    event = get_or_create(session, Player_Event_Label, id=int(type))
                    linked_player = fact.get("linked_player_hash", None)

                    player_event = Player_Event(game_id=game.id, player_id=player.hash, time=time, type=event.id, linked_player=linked_player)
                    session.add(player_event)
            session.add(game_lineup)

        # Get the coach
        coach = team_lineup["coach"][0]
        coach_hash = coach["hash"]
        coach_name = coach["long_name"]
        if " " not in coach_name:  # Since some players are missing their first name, do this to help with the search
            coach_name = "NULL " + coach_name
        coach_country = coach["country"]
        coach_player = get_or_create(session, Player, hash=coach_hash, name=coach_name, country=coach_country)
        game_lineup = GameLineup(game_id=game.id, team_id=team_id, player_id=coach_player.hash,
                                 shirt_number=None, position=None, starting=None, captain=False, coach=True, tactics=tactic)
        session.add(game_lineup)

    # Commit all changes at once
    session.commit()

    # Start parsing the events
    events = data["annotations"]
    for event in events:
        period, time = convert_to_seconds(event["gameTime"])
        label = event["label"]
        # Renaming labels
        if label == "soccer-ball":
            label = "goal"
        elif label == "y-card":
            label = "yellow card"
        elif label == "r-card":
            label = "red card"

        description = event["description"]
        important = event["important"] == "true"
        visible = event["visibility"]
        # Convert to boolean: True if shown, False if not
        visible = visible == "shown"
        position = int(event["position"])

        event = Caption(game_id=game.id, game_time=time, period=period, label=label, description=description,
                        important=important, visibility=visible, frame_stamp=position)
        session.add(event)
    session.commit()

    return game.id, home_team.id, away_team.id


def process_player_data(data):
    pass


def process_ASR_data(data, game_id, period):
    session = Session()
    seg = data["segments"]
    commentary_events = []  # Store the events in a list

    for k, v in seg.items():
        start = float(v[0])
        end = float(v[1])
        desc = v[2]
        event = Commentary(game_id=game_id, period=period, event_time_start=start, event_time_end=end, description=desc)
        commentary_events.append(event)

    # Bulk save objects
    session.bulk_save_objects(commentary_events)
    session.commit()
    session.close()


def convert_to_seconds(time_str):
    # Split the string into its components
    period, time = time_str.split(" - ")
    minutes, seconds = time.split(":")

    # Convert the components to integers
    period = int(period)
    minutes = int(minutes)
    seconds = int(seconds)

    # Calculate the time in seconds
    total_seconds = (minutes * 60) + seconds
    return period, total_seconds


def parse_labels_v2(data, session, home_team_id, away_team_id, game_id):
    annotations_data = data["annotations"]
    no_team = get_or_create(session, Team, name="not applicable")

    for annotation in annotations_data:
        period, game_time = convert_to_seconds(annotation["gameTime"])

        # Determine which team the annotation belongs to
        if annotation["team"] == "home":
            team_id = home_team_id
        elif annotation["team"] == "away":
            team_id = away_team_id
        else:
            team_id = no_team.id

        position = annotation.get("position", None)  # Assuming position can be null
        # Convert to boolean: True if visible, False if not
        visibility = annotation["visibility"] == "visible"
        label = annotation["label"]

        # Create and add the Annotations instance
        annotation_entry = Event(
            game_id=game_id,
            period=period,  # period
            game_time=game_time,  # Already in seconds
            frame_stamp=position,  # Make sure this is an integer or None
            team_id=team_id,  # Integer ID of the team
            visibility=visibility,  # Boolean
            label=label  # String with information
        )
        session.add(annotation_entry)

    session.commit()


def process_json_files(directory):
    session = Session()
    fill_player_events(session)
    for root, dirs, files in os.walk(directory):
        print(root)
        labels_file = None
        asr_files = []
        path_parts = root.split("\\")
        if len(path_parts) > 2:
            league = path_parts[-3].split("/")[-1]
            season = path_parts[-2]
            # Need the labels-v2 first as it contains the game ID
            for file in files:
                if 'Labels-caption.json' in file:
                    labels_file = file
                elif file.endswith('.json'):
                    asr_files.append(file)

            if labels_file:
                with open(os.path.join(root, labels_file), 'r') as f:
                    lb_cap = json.load(f)
                with open(os.path.join(root, "Labels-v2.json"), 'r') as f:
                    lb_v2 = json.load(f)
                game_id, home_team_id, away_team_id = process_game_data(lb_cap, lb_v2, league, season)

                for file in asr_files:
                    with open(os.path.join(root, file), 'r') as f:
                        asr = json.load(f)

                    # Determine the type of file and process accordingly
                    if 'Labels-v2' in file:
                        parse_labels_v2(asr, session, home_team_id, away_team_id, game_id)

                    elif '1_half-ASR' in file:
                        period = 1
                        # Parse and commit the data
                        process_ASR_data(data=asr, game_id=game_id, period=period)

                    elif '2_half-ASR' in file:
                        period = 2
                        # Parse and commit the data
                        process_ASR_data(data=asr, game_id=game_id, period=period)

    session.commit()
    session.close()


def fill_player_events(session):
    fact_id2label = {
        "1": "Yellow card",
        # Example: "time": "71' Ivanovic B. (Unsportsmanlike conduct)", "description": "Yellow Card"
        "2": "Red card",  # Example: "time": "70' Matic N. (Unsportsmanlike conduct)", "description": "Red Card"
        "3": "Goal",  # Example: "time": "14' Ivanovic B. (Hazard E.)", "description": "Goal"
        "4": "NA",
        "5": "NA 2",
        "6": "Substitution home",  # Example: "time": "72'", "description": "Ramires"
        "7": "Substitution away",  # Example: "time": "86'", "description": "Filipe Luis"
        "8": "Assistance"  # Example: "time": "14' Ivanovic B. (Hazard E.)", "description": "Assistance"
    }
    for key, value in fact_id2label.items():
        label = get_or_create(session, Player_Event_Label, label=value)
    session.commit()


def fill_Augmented_Team(file_path):
    df = pd.read_csv(file_path)
    # the df should have two columns, name and augmented_name

    session = Session()
    teams = session.query(Team).all()
    # For each row, find the team_id and add the augmented name
    for index, row in df.iterrows():
        team_name = row["name"]
        augmented_name = row["augmented_name"]
        # Strip leading and trailing whitespace
        augmented_name = augmented_name.strip()
        team = session.query(Team).filter_by(name=team_name).first()
        if team:
            augmented_team = get_or_create(session, Augmented_Team, team_id=team.id, augmented_name=augmented_name)
    session.commit()
    session.close()


def fill_Augmented_League(file_path):
    # Read the csv file
    df = pd.read_csv(file_path)
    # the df should have two columns, name and augmented_name

    session = Session()
    leagues = session.query(League).all()
    # For each row, find the league_id and add the augmented name
    for index, row in df.iterrows():
        league_name = row["name"]
        augmented_name = row["augmented_name"]
        # Strip leading and trailing whitespace
        augmented_name = augmented_name.strip()
        league = session.query(League).filter_by(name=league_name).first()
        if league:
            augmented_league = get_or_create(session, Augmented_League, league_id=league.id, augmented_name=augmented_name)
    session.commit()
    session.close()


if __name__ == "__main__":
    # Example directory path
    process_json_files('../../data/Dataset/SN-ASR_captions_and_actions/')
    fill_Augmented_Team('../../data/Dataset/augmented.csv')
    fill_Augmented_League('../../data/Dataset/augmented_leauges.csv')
    # TODO: Rename the event/annotation table to something more descriptive; the generic name conflicts with everything else.
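As a small worked example of the gameTime parsing in convert_to_seconds above (the input value is hypothetical; run from src/database/, and note that importing the module also creates the SQLite tables as a side effect):

```python
from database import convert_to_seconds  # run from src/database/

# "1 - 12:34" encodes: first half, 12 minutes 34 seconds into the half
period, seconds = convert_to_seconds("1 - 12:34")
assert (period, seconds) == (1, 754)  # 12 * 60 + 34
```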
src/database/readdata.ipynb
ADDED
The diff for this file is too large to render.
src/extractor.py
ADDED
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional
|
2 |
+
|
3 |
+
from langchain.chains import create_extraction_chain_pydantic
|
4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
5 |
+
from langchain.chains import create_extraction_chain
|
6 |
+
from copy import deepcopy
|
7 |
+
from langchain_openai import ChatOpenAI
|
8 |
+
from langchain_community.utilities import SQLDatabase
|
9 |
+
import os
|
10 |
+
import difflib
|
11 |
+
import ast
|
12 |
+
import json
|
13 |
+
import re
|
14 |
+
from thefuzz import process
|
15 |
+
# Set up logging
|
16 |
+
import logging
|
17 |
+
|
18 |
+
from dotenv import load_dotenv
|
19 |
+
|
20 |
+
load_dotenv(".env")
|
21 |
+
|
22 |
+
logging.basicConfig(level=logging.INFO)
|
23 |
+
# Save the log to a file
|
24 |
+
handler = logging.FileHandler('extractor.log')
|
25 |
+
logger = logging.getLogger(__name__)
|
26 |
+
|
27 |
+
os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
|
28 |
+
# os.environ["ANTHROPIC_API_KEY"] = os.getenv('ANTHROPIC_API_KEY')
|
29 |
+
|
30 |
+
if os.getenv('LANGSMITH'):
|
31 |
+
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
|
32 |
+
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
|
33 |
+
os.environ[
|
34 |
+
'LANGCHAIN_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
|
35 |
+
os.environ['LANGCHAIN_PROJECT'] = 'master-theses'
|
36 |
+
db = SQLDatabase.from_uri("sqlite:///data/games.db")
|
37 |
+
|
38 |
+
# from langchain_anthropic import ChatAnthropic
|
39 |
+
class Extractor():
|
40 |
+
# llm = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)
|
41 |
+
#gpt-3.5-turbo
|
42 |
+
def __init__(self, model="gpt-3.5-turbo-0125", schema_config=None, custom_extractor_prompt=None):
|
43 |
+
# model = "gpt-4-0125-preview"
|
44 |
+
if custom_extractor_prompt:
|
45 |
+
cust_promt = ChatPromptTemplate.from_template(custom_extractor_prompt)
|
46 |
+
|
47 |
+
self.llm = ChatOpenAI(model=model, temperature=0)
|
48 |
+
# self.llm = ChatAnthropic(model="claude-3-opus-20240229", temperature=0)
|
49 |
+
self.schema = schema_config or {}
|
50 |
+
self.chain = create_extraction_chain(self.schema, self.llm, prompt=cust_promt)
|
51 |
+
|
52 |
+
def extract(self, query):
|
53 |
+
return self.chain.invoke(query)
|
54 |
+
|
55 |
+
|
56 |
+
class Retriever():
|
57 |
+
def __init__(self, db, config):
|
58 |
+
self.db = db
|
59 |
+
self.config = config
|
60 |
+
self.table = config.get('db_table')
|
61 |
+
self.column = config.get('db_column')
|
62 |
+
self.pk_column = config.get('pk_column')
|
63 |
+
self.numeric = config.get('numeric', False)
|
64 |
+
self.response = []
|
65 |
+
self.query = f"SELECT {self.column} FROM {self.table}"
|
66 |
+
self.augmented_table = config.get('augmented_table', None)
|
67 |
+
self.augmented_column = config.get('augmented_column', None)
|
68 |
+
self.augmented_fk = config.get('augmented_fk', None)
|
69 |
+
|
70 |
+
def query_as_list(self):
|
71 |
+
# Execute the query
|
72 |
+
response = self.db.run(self.query)
|
73 |
+
response = [el for sub in ast.literal_eval(response) for el in sub if el]
|
74 |
+
if not self.numeric:
|
75 |
+
response = [re.sub(r"\b\d+\b", "", string).strip() for string in response]
|
76 |
+
self.response = list(set(response))
|
77 |
+
# print(self.response)
|
78 |
+
return self.response
|
79 |
+
|
80 |
+
def get_augmented_items(self, prompt):
|
81 |
+
if self.augmented_table is None:
|
82 |
+
return None
|
83 |
+
else:
|
84 |
+
# Construct the query to search for the prompt in the augmented table
|
85 |
+
query = f"SELECT {self.augmented_fk} FROM {self.augmented_table} WHERE LOWER({self.augmented_column}) = LOWER('{prompt}')"
|
86 |
+
|
87 |
+
# Execute the query
|
88 |
+
fk_response = self.db.run(query)
|
89 |
+
if fk_response:
|
90 |
+
# Extract the FK value
|
91 |
+
fk_response = ast.literal_eval(fk_response)
|
92 |
+
fk_value = fk_response[0][0]
|
93 |
+
query = f"SELECT {self.column} FROM {self.table} WHERE {self.pk_column} = {fk_value}"
|
94 |
+
# Execute the query
|
95 |
+
matching_response = self.db.run(query)
|
96 |
+
# Extract the matching response
|
97 |
+
matching_response = ast.literal_eval(matching_response)
|
98 |
+
matching_response = matching_response[0][0]
|
99 |
+
return matching_response
|
100 |
+
else:
|
101 |
+
return None
|
102 |
+
|
103 |
+
def find_close_matches(self, target_string, n=3, method="difflib", threshold=70):
|
104 |
+
"""
|
105 |
+
Find and return the top n close matches to target_string in the database query results.
|
106 |
+
|
107 |
+
Args:
|
108 |
+
- target_string (str): The string to match against the database results.
|
109 |
+
- n (int): Number of top matches to return.
|
110 |
+
|
111 |
+
Returns:
|
112 |
+
- list of tuples: Each tuple contains a match and its score.
|
113 |
+
"""
|
114 |
+
# Ensure we have the response list populated
|
115 |
+
if not self.response:
|
116 |
+
self.query_as_list()
|
117 |
+
|
118 |
+
# Find top n close matches
|
119 |
+
if method == "fuzzy":
|
120 |
+
# Use the fuzzy_string method to get matches and their scores
|
121 |
+
# If the threshold is met, return the best match; otherwise, return all matches meeting the threshold
|
122 |
+
top_matches = self.fuzzy_string(target_string, limit=n, threshold=threshold)
|
123 |
+
|
124 |
+
|
125 |
+
else:
|
126 |
+
# Use difflib's get_close_matches to get the top n matches
|
127 |
+
top_matches = difflib.get_close_matches(target_string, self.response, n=n, cutoff=0.2)
|
128 |
+
|
129 |
+
return top_matches
|
130 |
+
|
131 |
+
def fuzzy_string(self, prompt, limit, threshold=80, low_threshold=30):
|
132 |
+
|
133 |
+
# Get matches and their scores, limited by the specified 'limit'
|
134 |
+
matches = process.extract(prompt, self.response, limit=limit)
|
135 |
+
|
136 |
+
|
137 |
+
filtered_matches = [match for match in matches if match[1] >= threshold]
|
138 |
+
|
139 |
+
# If no matches meet the threshold, return the list of all matches' strings
|
140 |
+
if not filtered_matches:
|
141 |
+
# Return matches above the low_threshold
|
142 |
+
# Fix for wrong properties being returned
|
143 |
+
return [match[0] for match in matches if match[1] >= low_threshold]
|
144 |
+
|
145 |
+
|
146 |
+
# If there's only one match meeting the threshold, return it as a string
|
147 |
+
if len(filtered_matches) == 1:
|
148 |
+
return filtered_matches[0][0] # Return the matched string directly
|
149 |
+
|
150 |
+
# If there's more than one match meeting the threshold or ties, return the list of matches' strings
|
151 |
+
highest_score = filtered_matches[0][1]
|
152 |
+
ties = [match for match in filtered_matches if match[1] == highest_score]
|
153 |
+
|
154 |
+
# Return the strings of tied matches directly, ignoring the scores
|
155 |
+
m = [match[0] for match in ties]
|
156 |
+
if len(m) == 1:
|
157 |
+
return m[0]
|
158 |
+
return [match[0] for match in ties]
|
159 |
+
|
160 |
+
def fetch_pk(self, property_name, property_value):
|
161 |
+
# Some properties do not have a primary key
|
162 |
+
# Return the property value if no primary key is specified
|
163 |
+
pk_list = []
|
164 |
+
|
165 |
+
# Check if the property_value is a list; if not, make it a list for uniform processing
|
166 |
+
if not isinstance(property_value, list):
|
167 |
+
property_value = [property_value]
|
168 |
+
|
169 |
+
# Some properties do not have a primary key
|
170 |
+
# Return None for each property_value if no primary key is specified
|
171 |
+
if self.pk_column is None:
|
172 |
+
return [None for _ in property_value]
|
173 |
+
|
174 |
+
for value in property_value:
|
175 |
+
query = f"SELECT {self.pk_column} FROM {self.table} WHERE {self.column} = '{value}' LIMIT 1"
|
176 |
+
response = self.db.run(query)
|
177 |
+
|
178 |
+
# Append the response (PK or None) to the pk_list
|
179 |
+
pk_list.append(response)
|
180 |
+
|
181 |
+
return pk_list
|
182 |
+
|
183 |
+
|
184 |
+
def setup_retrievers(db, schema_config):
|
185 |
+
# retrievers = {}
|
186 |
+
# for prop, config in schema_config["properties"].items():
|
187 |
+
# retrievers[prop] = Retriever(db=db, config=config)
|
188 |
+
# return retrievers
|
189 |
+
|
190 |
+
retrievers = {}
|
191 |
+
# Iterate over each property in the schema_config's properties
|
192 |
+
for prop, config in schema_config["properties"].items():
|
193 |
+
# Access the 'items' dictionary for the configuration of the array's elements
|
194 |
+
item_config = config['items']
|
195 |
+
# Create a Retriever instance using the item_config
|
196 |
+
retrievers[prop] = Retriever(db=db, config=item_config)
|
197 |
+
return retrievers
|
198 |
+
|
199 |
+
|
200 |
+
def extract_properties(prompt, schema_config, custom_extractor_prompt=None):
|
201 |
+
"""Extract properties from the prompt."""
|
202 |
+
# modify schema_conf to only include the required properties
|
203 |
+
schema_stripped = {'properties': {}}
|
204 |
+
for key, value in schema_config['properties'].items():
|
205 |
+
schema_stripped['properties'][key] = {
|
206 |
+
'type': value['type'],
|
207 |
+
'items': {'type': value['items']['type']}
|
208 |
+
}
|
209 |
+
|
210 |
+
extractor = Extractor(schema_config=schema_stripped, custom_extractor_prompt=custom_extractor_prompt)
|
211 |
+
extraction_result = extractor.extract(prompt)
|
212 |
+
# print("Extraction Result:", extraction_result)
|
213 |
+
|
214 |
+
if 'text' in extraction_result and extraction_result['text']:
|
215 |
+
properties = extraction_result['text']
|
216 |
+
return properties
|
217 |
+
else:
|
218 |
+
print("No properties extracted.")
|
219 |
+
return None
|
220 |
+
|
221 |
+
|
222 |
+
def recheck_property_value(properties, property_name, retrievers, input_func):
|
223 |
+
while True:
|
224 |
+
new_value = input_func(f"Enter new value for {property_name} or type 'quit' to stop: ")
|
225 |
+
if new_value.lower() == 'quit':
|
226 |
+
break # Exit the loop and do not update the property
|
227 |
+
|
228 |
+
new_top_matches = retrievers[property_name].find_close_matches(new_value, n=3)
|
229 |
+
if new_top_matches:
|
230 |
+
# Display new top matches and ask for confirmation or re-entry
|
231 |
+
print("\nNew close matches found:")
|
232 |
+
for i, match in enumerate(new_top_matches, start=1):
|
233 |
+
print(f"[{i}] {match}")
|
234 |
+
print("[4] Re-enter value")
|
235 |
+
print("[5] Quit without updating")
|
236 |
+
|
237 |
+
selection = input_func("Select the best match (1-3), choose 4 to re-enter value, or 5 to quit: ")
|
238 |
+
if selection in ['1', '2', '3']:
|
239 |
+
selected_match = new_top_matches[int(selection) - 1]
|
240 |
+
properties[property_name] = selected_match # Update the dictionary directly
|
241 |
+
print(f"Updated {property_name} to {selected_match}")
|
242 |
+
break # Successfully updated, exit the loop
|
243 |
+
elif selection == '5':
|
244 |
+
break # Quit without updating
|
245 |
+
# Loop will continue if user selects 4 or inputs invalid selection
|
246 |
+
else:
|
247 |
+
print("No close matches found. Please try again or type 'quit' to stop.")
|
248 |
+
|
249 |
+
|
250 |
+
def check_and_update_properties(properties_list, retrievers, method="fuzzy", input_func=input):
|
251 |
+
"""
|
252 |
+
Checks and updates the properties in the properties list based on close matches found in the database.
|
253 |
+
The function iterates through each property in each property dictionary within the list,
|
254 |
+
finds close matches for it in the database using the retrievers, and updates the property
|
255 |
+
value based on user selection.
|
256 |
+
|
257 |
+
Args:
|
258 |
+
properties_list (list of dict): A list of dictionaries, where each dictionary contains properties
|
259 |
+
to check and potentially update based on database matches.
|
260 |
+
retrievers (dict): A dictionary of Retriever objects keyed by property name, used to find close matches in the database.
|
261 |
+
input_func (function, optional): A function to capture user input. Defaults to the built-in input function.
|
262 |
+
|
263 |
+
The function updates the properties_list in place based on user choices for updating property values
|
264 |
+
with close matches found by the retrievers.
|
265 |
+
"""
|
266 |
+
|
267 |
+
for index, properties in enumerate(properties_list):
|
268 |
+
for property_name, retriever in retrievers.items(): # Iterate using items to get both key and value
|
269 |
+
property_values = properties.get(property_name, [])
|
270 |
+
if not property_values: # Skip if the property is not present or is an empty list
|
271 |
+
continue
|
272 |
+
|
273 |
+
updated_property_values = [] # To store updated list of values
|
274 |
+
|
275 |
+
for value in property_values:
|
276 |
+
if retriever.augmented_table:
|
277 |
+
augmented_value = retriever.get_augmented_items(value)
|
278 |
+
if augmented_value:
|
279 |
+
updated_property_values.append(augmented_value)
|
280 |
+
continue
|
281 |
+
# Since property_value is now expected to be a list, we handle each value individually
|
282 |
+
top_matches = retriever.find_close_matches(value, method=method, n=3)
|
283 |
+
|
284 |
+
# Check if the closest match is the same as the current value
|
285 |
+
if top_matches and top_matches[0] == value:
|
286 |
+
updated_property_values.append(value)
|
287 |
+
continue
|
288 |
+
|
289 |
+
if not top_matches:
|
290 |
+
updated_property_values.append(value) # Keep the original value if no matches found
|
291 |
+
continue
|
292 |
+
|
293 |
+
if type(top_matches) == str and method == "fuzzy":
|
294 |
+
# If the top_matches is a string, it means that the threshold was met and only one item was returned
|
295 |
+
# In this case, we can directly update the property with the top match
|
296 |
+
updated_property_values.append(top_matches)
|
297 |
+
properties[property_name] = updated_property_values
|
298 |
+
continue
|
299 |
+
|
300 |
+
print(f"\nCurrent {property_name}: {value}")
|
301 |
+
for i, match in enumerate(top_matches, start=1):
|
302 |
+
print(f"[{i}] {match}")
|
303 |
+
print("[4] Enter new value")
|
304 |
+
|
305 |
+
# hmm = input_func(f"Fix for Pycharm, press enter to continue")
|
306 |
+
|
307 |
+
choice = input_func(f"Select the best match for {property_name} (1-4): ")
|
308 |
+
if choice in ['1', '2', '3']:
|
309 |
+
selected_match = top_matches[int(choice) - 1]
|
310 |
+
updated_property_values.append(selected_match) # Update with the selected match
|
311 |
+
print(f"Updated {property_name} to {selected_match}")
|
312 |
+
elif choice == '4':
|
313 |
+
# Allow re-entry of value for this specific item
|
314 |
+
recheck_property_value(properties, property_name, value, retrievers, input_func)
|
315 |
+
# Note: Implement recheck_property_value to handle individual value updates within the list
|
316 |
+
else:
|
317 |
+
print("Invalid selection. Property not updated.")
|
318 |
+
updated_property_values.append(value) # Keep the original value
|
319 |
+
|
320 |
+
# Update the entire list for the property after processing all values
|
321 |
+
properties[property_name] = updated_property_values
|
322 |
+
|
323 |
+
|
324 |
+
# Function to remove duplicates
|
325 |
+
def remove_duplicates(dicts):
|
326 |
+
seen = {} # Dictionary to keep track of seen values for each key
|
327 |
+
for d in dicts:
|
328 |
+
for key in list(d.keys()): # Use list to avoid RuntimeError for changing dict size during iteration
|
329 |
+
value = d[key]
|
330 |
+
if key in seen and value == seen[key]:
|
331 |
+
del d[key] # Remove key-value pair if duplicate is found
|
332 |
+
else:
|
333 |
+
seen[key] = value # Update seen values for this key
|
334 |
+
return dicts
|
335 |
+
|
336 |
+
|
337 |
+
def fetch_pks(properties_list, retrievers):
    all_pk_attributes = []  # Initialize a list to store dictionaries of _pk attributes for each item in properties_list

    # Iterate through each properties dictionary in the list
    for properties in properties_list:
        pk_attributes = {}  # Initialize a dictionary for the current set of properties
        for property_name, property_value in properties.items():
            if property_name in retrievers:
                # Fetch the primary key using the retriever for the current property
                pk = retrievers[property_name].fetch_pk(property_name, property_value)
                # Store it in the dictionary with a modified key name
                pk_attributes[f"{property_name}_pk"] = pk

        # Add the dictionary of _pk attributes for the current set of properties to the list
        all_pk_attributes.append(pk_attributes)

    # Return a list of dictionaries, where each dictionary contains _pk attributes for a set of properties
    return all_pk_attributes

def update_prompt(prompt, properties, pk, properties_original):
    # Replace the placeholders in the original prompt with the updated properties and pk
    prompt = prompt.replace("{{properties}}", str(properties))
    prompt = prompt.replace("{{pk}}", str(pk))
    return prompt

def update_prompt_enhanced(prompt, properties, pk, properties_original):
    updated_info = ""
    for prop, pk_info, prop_orig in zip(properties, pk, properties_original):
        for key in prop.keys():
            # Extract original and updated values
            orig_values = prop_orig.get(key, [])
            updated_values = prop.get(key, [])

            # Ensure both original and updated values are lists for uniform processing
            if not isinstance(orig_values, list):
                orig_values = [orig_values]
            if not isinstance(updated_values, list):
                updated_values = [updated_values]

            # Extract primary key detail for this key, handling various pk formats carefully
            pk_key = f"{key}_pk"  # Construct pk key name based on the property key
            pk_details = pk_info.get(pk_key, [])
            if not isinstance(pk_details, list):
                pk_details = [pk_details]

            for orig_value, updated_value, pk_detail in zip(orig_values, updated_values, pk_details):
                pk_value = None
                if isinstance(pk_detail, str):
                    pk_value = pk_detail.strip("[]()").split(",")[0].replace("'", "").replace('"', '')

                update_statement = ""
                # Skip updating if there's no change in value to avoid redundant info
                if orig_value != updated_value and pk_value:
                    update_statement = f"\n- {orig_value} (now referred to as {updated_value}) has a primary key: {pk_value}."
                elif orig_value != updated_value:
                    update_statement = f"\n- {orig_value} (now referred to as {updated_value})."
                elif pk_value:
                    update_statement = f"\n- {orig_value} has a primary key: {pk_value}."

                updated_info += update_statement

    if updated_info:
        prompt += "\nUpdated Information:" + updated_info

    return prompt

def prompt_cleaner(prompt, db, schema_config):
    """Main function to clean the prompt."""

    retrievers = setup_retrievers(db, schema_config)

    properties = extract_properties(prompt, schema_config)
    # Keep original properties for later use
    properties_original = deepcopy(properties)
    # Remove duplicates - happens when there is more than one player or team in the prompt
    properties = remove_duplicates(properties)
    if properties:
        check_and_update_properties(properties, retrievers)

    pk = fetch_pks(properties, retrievers)
    properties = update_prompt_enhanced(prompt, properties, pk, properties_original)

    return properties, pk

class PromptCleaner:
    """
    A class designed to clean and process prompts by extracting properties, removing duplicates,
    and updating these properties based on a predefined schema configuration and database interactions.

    Attributes:
        db: A database connection object used to execute queries and fetch data.
        schema_config: A dictionary defining the schema configuration for the extraction process.
            schema_config = {
                "properties": {
                    # Property name
                    "person_name": {"type": "string", "db_table": "players", "db_column": "name", "pk_column": "hash",
                                    # if mostly numeric, such as 2015-2016, set true
                                    "numeric": False},
                    "team_name": {"type": "string", "db_table": "teams", "db_column": "name", "pk_column": "id",
                                  "numeric": False},
                    # Add more as needed
                },
                # Parameter to the extractor: if person_name is required, add it here and the extractor
                # will return an error if it is not found
                "required": [],
            }

    Methods:
        clean(prompt): Cleans the given prompt by extracting and updating properties based on the database.
            Returns a tuple containing the updated properties and their primary keys.
    """

    def __init__(self, db=db, schema_config=None, custom_extractor_prompt=None):
        """
        Initializes the PromptCleaner with a database connection and a schema configuration.

        Args:
            db: The database connection object to be used for querying (if None, the default db is used).
            schema_config: A dictionary defining properties and their database mappings for extraction and updating.
        """
        self.db = db
        self.schema_config = schema_config
        self.retrievers = setup_retrievers(self.db, self.schema_config)
        self.cust_extractor_prompt = custom_extractor_prompt

    def clean(self, prompt, return_pk=False, test=False, verbose=False):
        """
        Processes the given prompt to extract properties, remove duplicates, update the properties
        based on close matches within the database, and fetch primary keys for these properties.

        The method first extracts properties from the prompt using the schema configuration,
        then checks these properties against the database to find and update close matches.
        It also fetches primary keys for the updated properties where applicable.

        Args:
            prompt (str): The prompt text to be cleaned and processed.
            return_pk (bool): A flag to indicate whether to return primary keys along with the properties.
            test (bool): A flag to indicate whether to return the original properties for testing purposes.
            verbose (bool): A flag to indicate whether to also return the original properties for debugging.

        Returns:
            tuple: A tuple containing two elements:
                - The first element is the original prompt, with updated information that exists in the db.
                - The second element is a list of dictionaries, each containing primary keys for the properties,
                  where applicable.
        """
        if self.cust_extractor_prompt:
            properties = extract_properties(prompt, self.schema_config, self.cust_extractor_prompt)
        else:
            properties = extract_properties(prompt, self.schema_config)
        # Keep original properties for later use
        properties_original = deepcopy(properties)
        if test:
            return properties_original
        # Remove duplicates - happens when there is more than one player or team in the prompt
        # properties = remove_duplicates(properties)
        pk = None
        if properties:
            check_and_update_properties(properties, self.retrievers)
            pk = fetch_pks(properties, self.retrievers)
            properties = update_prompt_enhanced(prompt, properties, pk, properties_original)

        if return_pk:
            return properties, pk
        elif verbose:
            return properties, properties_original
        else:
            return properties

def load_json(file_path: str) -> dict:
    with open(file_path, 'r') as file:
        return json.load(file)

def create_extractor(schema: str = "src/conf/schema.json", db: str = "sqlite:///data/games.db"):
    schema_config = load_json(schema)
    db = SQLDatabase.from_uri(db)
    pre_prompt = """Extract and save the relevant entities mentioned \
in the following passage together with their properties.

Only extract the properties mentioned in the 'information_extraction' function.

The questions are soccer related. game_event are things like yellow cards, goals, assists, free kicks etc.
Generic properties like "description", "home team", "away team", "game" etc. should NOT be extracted.

If a property is not present and is not required in the function parameters, do not include it in the output.
If no properties are found, return an empty list.

Here are some examples:
'How many goals did Henry score for Arsnl in the 2015 season?'
'person_name': ['Henry'], 'team_name': ['Arsnl'], 'year_season': ['2015'],

Passage:
{input}
"""

    return PromptCleaner(db, schema_config, custom_extractor_prompt=pre_prompt)

if __name__ == "__main__":

    schema_config = load_json("src/conf/schema.json")
    # Add game and league to the schema_config

    # prompter = PromptCleaner(db, schema_config, custom_extractor_prompt=extract_prompt)
    prompter = create_extractor("src/conf/schema.json", "sqlite:///data/games.db")
    prompt = prompter.clean("Give me goals, shots on target, shots off target and corners from the game between ManU and Swansa")

    print(prompt)
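For illustration, `clean()` returns the original question with an appended "Updated Information" block built by `update_prompt_enhanced`, listing each resolved entity and its primary key. A minimal usage sketch (the primary-key value shown is hypothetical; the actual output depends on the contents of `data/games.db`, and `check_and_update_properties` may ask on stdin to confirm close matches):

````python
from src.extractor import create_extractor

ex = create_extractor("src/conf/schema.json", "sqlite:///data/games.db")
cleaned = ex.clean("How many goals did ManU score in the 2015 season?")
print(cleaned)
# Possible output (illustrative only):
# How many goals did ManU score in the 2015 season?
# Updated Information:
# - ManU (now referred to as Manchester United) has a primary key: 1.
````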
src/sql_chain.py
ADDED
@@ -0,0 +1,160 @@
import logging
import json
import os
from langchain_community.vectorstores import FAISS
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.agent_toolkits import create_sql_agent
from langchain_core.prompts import (
    ChatPromptTemplate,
    FewShotPromptTemplate,
    MessagesPlaceholder,
    PromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_community.utilities import SQLDatabase
from dotenv import load_dotenv

load_dotenv(".env")

logging.basicConfig(level=logging.INFO)
# Save the log to a file
handler = logging.FileHandler('extractor.log')
logger = logging.getLogger(__name__)

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

if os.getenv('LANGSMITH'):
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
    os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGSMITH_API_KEY")
    os.environ['LANGCHAIN_PROJECT'] = 'master-theses'

def load_json(file_path: str) -> dict:
    with open(file_path, 'r') as file:
        return json.load(file)

class SqlChain:
    def __init__(self, few_shot_prompts: str, llm_model="gpt-3.5-turbo", db_uri="sqlite:///data/games.db", few_shot_k=2, verbose=True):
        self.llm = ChatOpenAI(model=llm_model, temperature=0)
        self.db = SQLDatabase.from_uri(db_uri)
        self.few_shot_k = few_shot_k
        self.few_shot = self._set_up_few_shot_prompts(load_json(few_shot_prompts))
        self.full_prompt = None

        self.agent = create_sql_agent(
            llm=self.llm,
            db=self.db,
            prompt=self.full_prompt,
            max_iterations=10,
            verbose=verbose,
            agent_type="openai-tools",
            # Default is 10 rows; return up to 30 here - can be overridden in the prompt
            top_k=30,
        )

    def _set_up_few_shot_prompts(self, few_shot_prompts: list) -> SemanticSimilarityExampleSelector:
        few_shots = SemanticSimilarityExampleSelector.from_examples(
            few_shot_prompts,
            OpenAIEmbeddings(),
            FAISS,
            k=self.few_shot_k,
            input_keys=["input"],
        )
        return few_shots

    def few_prompt_construct(self, query: str, top_k=5, dialect="SQLite") -> None:
        system_prefix = """You are an agent designed to interact with a SQL database.
Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.
ALWAYS query the database before returning an answer.
Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most {top_k} results.
You can order the results by a relevant column to return the most interesting examples in the database.
Never query for all the columns from a specific table, only ask for the relevant columns given the question.
You have access to tools for interacting with the database.
Only use the given tools. Only use the information returned by the tools to construct your final answer.
You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.

DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.

If the question does not seem related to the database, just return 'I don't know' as the answer.
DO NOT include information that is not present in the database in your answer.

Here are some examples of user inputs and their corresponding SQL queries. They are tested and work.
Use them as a guide when creating your own queries:"""

        SUFFIX = """Begin!

Question: {input}
Thought: I should look at the tables in the database to see what I can query. Then I should query the schema of the most relevant tables.
I will not stop until I query the database and return the answer.
{agent_scratchpad}"""

        few_shot_prompt = FewShotPromptTemplate(
            example_selector=self.few_shot,
            example_prompt=PromptTemplate.from_template(
                "User input: {input}\nSQL query: {query}"
            ),
            input_variables=["input", "dialect", "top_k"],
            prefix=system_prefix,
            suffix=SUFFIX,
        )
        full_prompt = ChatPromptTemplate.from_messages(
            [
                SystemMessagePromptTemplate(prompt=few_shot_prompt),
                ("human", "{input}"),
                MessagesPlaceholder("agent_scratchpad"),
            ]
        )
        self.full_prompt = full_prompt.invoke(
            {
                "input": query,
                "top_k": top_k,
                "dialect": dialect,
                "agent_scratchpad": [],
            }
        )

    def prompt_no_few_shot(self, query: str, dialect="SQLite") -> str:
        system_prefix = """You are an agent designed to interact with a SQL database.
Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.
Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most {top_k} results.
You can order the results by a relevant column to return the most interesting examples in the database.
Never query for all the columns from a specific table, only ask for the relevant columns given the question.
You have access to tools for interacting with the database.
Only use the given tools. Only use the information returned by the tools to construct your final answer.
You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.

DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.

If the question does not seem related to the database, just return 'I don't know' as the answer.
DO NOT include information that is not present in the database in your answer."""

        return f"{system_prefix}\n{query}"

    def ask(self, query: str, few_prompt: bool = True) -> tuple:
        if few_prompt:
            self.few_prompt_construct(query)
            return self.agent.invoke({"input": self.full_prompt}), self.full_prompt
        else:
            prompt = self.prompt_no_few_shot(query)
            return self.agent.invoke(prompt), prompt


def create_agent(few_shot_prompts: str = "src/conf/sqls.json", llm_model="gpt-3.5-turbo-0125",
                 db_uri="sqlite:///data/games.db", few_shot_k=2, verbose=True):
    """Create an agent with the given few_shot_prompts, llm_model and db_uri.
    Call it with agent.ask(prompt)."""
    return SqlChain(few_shot_prompts, llm_model, db_uri, few_shot_k, verbose)


if __name__ == "__main__":
    chain = SqlChain("src/conf/sqls.json")
    chain.ask("Is Manchester United in the database?", False)
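`SqlChain` loads its few-shot examples from `src/conf/sqls.json`, passes them to `SemanticSimilarityExampleSelector.from_examples` with `input_keys=["input"]`, and renders each selected example through the `"User input: {input}\nSQL query: {query}"` template. The file is therefore expected to contain a JSON list of objects with `input` and `query` fields; a rough sketch of that shape (the question and SQL below are invented placeholders, not entries from the actual file):

````python
# Illustration of the structure sqls.json should have once loaded with load_json();
# the question text and SQL are hypothetical, only the "input"/"query" keys are implied by the code.
examples = [
    {
        "input": "How many games did Manchester United play in the 2015-2016 season?",
        "query": "SELECT COUNT(*) FROM games WHERE ...;",  # hypothetical table and columns
    },
]
````

With that file in place the agent can be used as in main.py: `ag = create_agent(verbose=False)` followed by `result, prompt_used = ag.ask("your question")`, since `ask` returns both the agent's answer and the prompt it was given.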