shakesbeardz
Update .gitattributes to track large and binary files with Git LFS
b40e563
import io
import boto3
import requests
import numpy as np
import polars as pl
from PIL import Image
from botocore.config import Config
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# S3 client used to fetch sample images; requests are pinned to us-east-1.
my_config = Config(region_name="us-east-1")
s3_client = boto3.client("s3", config=my_config)

# Base URL of EOL taxon pages; an EOL page ID is appended to form a full link.
EOL_URL = "https://eol.org/pages/"

# Taxonomic ranks ordered from highest to lowest. Functions in this module
# take a rank as an index into this list.
RANKS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
def get_sample(df, pred_taxon, rank):
    '''
    Function to retrieve a sample image of the predicted taxon and EOL page link for more info.

    Failures are never raised to the caller: any error is logged and reported
    through the second return value, with ``None`` in place of the image.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display, or None when no sample
        could be found or retrieved.
    eol_page : str
        HTML snippet linking to the EOL page for the taxon (may be a lower rank,
        e.g., species sample). On failure, a plain-text message explaining why
        no sample is available.
    '''
    logger.info(f"Getting sample for taxon: {pred_taxon} at rank: {rank}")
    try:
        filepath, eol_page_id, full_name, is_exact = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        logger.error(f"Error retrieving sample data: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

    if filepath is None:
        logger.warning(f"No sample image found for taxon: {pred_taxon}")
        return None, f"Sorry, our EOL images do not include {pred_taxon}."

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': 'treeoflife-10m-sample-images', 'Key': filepath},
        )
        # A timeout keeps a stalled connection from blocking the caller forever.
        img_resp = requests.get(img_src, timeout=30)
        # Surface HTTP errors (e.g. 403/404 from S3) directly instead of
        # handing an error document to PIL and getting an unrelated failure.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))

        full_eol_url = EOL_URL + eol_page_id
        if is_exact:
            eol_page = f"<p>Check out the EOL entry for {pred_taxon} to learn more: <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        else:
            eol_page = f"<p>Check out an example EOL entry within {pred_taxon} to learn more: {full_name} <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"

        logger.info(f"Successfully retrieved sample image and EOL page for {pred_taxon}")
        return img, eol_page
    except Exception as e:
        logger.error(f"Error retrieving sample image: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
def get_sample_data(df, pred_taxon, rank):
    '''
    Function to randomly select a sample individual of the given taxon and return its
    image filepath, EOL page ID, and taxonomic name.

    Rows whose ranks below the requested one are all empty (null or "") are
    preferred, since they describe the taxon itself; otherwise any row within
    the taxon is sampled as an example.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image. Split on single spaces, with the
        i-th token matched against the i-th rank in RANKS.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str
        Filepath of selected sample image for predicted taxon (None if no match).
    eol_page_id : str
        EOL page ID associated with predicted taxon for more information
        (np.nan if no match).
    full_name : str
        Full taxonomic name of the selected sample ("" if no match).
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon has fewer space-separated tokens than rank + 1.
    '''
    # Narrow the table to rows matching the predicted taxon at every rank
    # from kingdom down to the requested one.
    taxon_parts = pred_taxon.split(" ")
    for idx in range(rank + 1):
        df = df.filter(pl.col(RANKS[idx]) == taxon_parts[idx])

    if df.shape[0] == 0:
        return None, np.nan, "", False

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in RANKS[rank + 1:]:
        exact_df = exact_df.filter(pl.col(lower_rank).is_null() | (pl.col(lower_rank) == ""))
    is_exact = exact_df.shape[0] > 0

    # If no exact matches, fall back to any entry within the specified taxon
    df_filtered = (exact_df if is_exact else df).sample()

    # An exact match is named by the ranks down to the requested one; an
    # example entry is named by its full lineage. Lower ranks may be null or
    # empty, so skip those values: joining None raises TypeError and ""
    # leaves stray spaces in the name.
    name_ranks = RANKS[:rank + 1] if is_exact else RANKS
    full_name = " ".join(value for value in df_filtered.select(name_ranks).row(0) if value)

    return (
        df_filtered["file_path"][0],
        df_filtered["eol_page_id"].cast(pl.String)[0],
        full_name,
        is_exact,
    )