Spaces:
Sleeping
Sleeping
import io | |
import boto3 | |
import requests | |
import numpy as np | |
import polars as pl | |
from PIL import Image | |
from botocore.config import Config | |
import logging | |
logger = logging.getLogger(__name__)

# S3 client used to fetch the sample images; the client is pinned to us-east-1.
my_config = Config(region_name='us-east-1')
s3_client = boto3.client('s3', config=my_config)

# Base URL of EOL taxon pages; an EOL page ID is appended to it to build a link.
EOL_URL = "https://eol.org/pages/"

# Taxonomic rank names ordered from broadest to most specific. A rank is
# referred to elsewhere in this module by its integer index into this list.
RANKS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
def get_sample(df, pred_taxon, rank):
    '''
    Function to retrieve a sample image of the predicted taxon and EOL page link for more info.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display. None when no sample is available
        or when retrieving it failed.
    eol_page : str
        HTML snippet linking to the EOL page for the taxon (may be a lower rank, e.g., species sample).
        When img is None, this is instead a plain-text message explaining why no sample is shown.
    '''
    # Seconds to wait on the image download; without a timeout a stalled
    # connection would block this call indefinitely.
    request_timeout = 10

    logger.info(f"Getting sample for taxon: {pred_taxon} at rank: {rank}")

    try:
        filepath, eol_page_id, full_name, is_exact = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        # Boundary handler: report the problem to the user instead of crashing,
        # but keep the traceback in the log for debugging.
        logger.exception(f"Error retrieving sample data: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

    if filepath is None:
        logger.warning(f"No sample image found for taxon: {pred_taxon}")
        return None, f"Sorry, our EOL images do not include {pred_taxon}."

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': 'treeoflife-10m-sample-images',
                    'Key': filepath},
        )
        img_resp = requests.get(img_src, timeout=request_timeout)
        # Fail on HTTP errors here so an S3 error document is not handed to
        # PIL and reported as an unreadable image.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))

        full_eol_url = EOL_URL + eol_page_id
        if is_exact:
            eol_page = f"<p>Check out the EOL entry for {pred_taxon} to learn more: <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        else:
            eol_page = f"<p>Check out an example EOL entry within {pred_taxon} to learn more: {full_name} <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"

        logger.info(f"Successfully retrieved sample image and EOL page for {pred_taxon}")
        return img, eol_page
    except Exception as e:
        logger.exception(f"Error retrieving sample image: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
def get_sample_data(df, pred_taxon, rank):
    '''
    Function to randomly select a sample individual of the given taxon and return its
    image filepath, EOL page ID, and taxonomic name.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
        Also read here: one column per entry of RANKS and an "eol_page_id" column.
    pred_taxon : str
        Predicted taxon of the uploaded image, as space-separated rank values
        ordered like RANKS (one value for each rank up to and including `rank`).
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str or None
        Filepath of selected sample image for predicted taxon. None if no row matches.
    eol_page_id : str
        EOL page ID associated with predicted taxon for more information.
        np.nan if no row matches.
    full_name : str
        Full taxonomic name of the selected sample (ranks without a value are omitted).
        Empty string if no row matches.
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon has fewer space-separated parts than rank + 1 while rows still match.
    '''
    # Split once instead of on every loop iteration.
    name_parts = pred_taxon.split(" ")

    # Narrow the table one rank at a time, stopping as soon as nothing matches.
    for idx in range(rank + 1):
        df = df.filter(pl.col(RANKS[idx]) == name_parts[idx])
        if df.shape[0] == 0:
            return None, np.nan, "", False

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in RANKS[rank + 1:]:
        exact_df = exact_df.filter((pl.col(lower_rank).is_null()) | (pl.col(lower_rank) == ""))

    is_exact = exact_df.shape[0] > 0
    if is_exact:
        df_filtered = exact_df.sample()
        name_ranks = RANKS[:rank + 1]
    else:
        # If no exact matches, return any entry with the specified rank
        df_filtered = df.sample()
        name_ranks = RANKS

    # Drop null/empty rank values: a sampled row may have some lower ranks
    # filled and others null, and str.join raises TypeError on None.
    full_name = " ".join(part for part in df_filtered.select(name_ranks).row(0) if part)

    return df_filtered["file_path"][0], df_filtered["eol_page_id"].cast(pl.String)[0], full_name, is_exact