Spaces:
Running
Running
File size: 4,633 Bytes
b40e563 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import io
import boto3
import requests
import numpy as np
import polars as pl
from PIL import Image
from botocore.config import Config
import logging
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# S3 client used to fetch the sample images; requests are pinned to us-east-1.
my_config = Config(region_name='us-east-1')
s3_client = boto3.client('s3', config=my_config)

# Base URL for EOL taxon pages; an EOL page ID is appended to build a full link.
EOL_URL = "https://eol.org/pages/"

# Taxonomic ranks, ordered from highest (kingdom) to lowest (species).
RANKS = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
# Seconds to wait for the sample-image download before giving up, so a stalled
# connection cannot hang the caller indefinitely.
_SAMPLE_IMAGE_TIMEOUT = 10


def get_sample(df, pred_taxon, rank):
    '''
    Function to retrieve a sample image of the predicted taxon and EOL page link for more info.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    img : PIL.Image or None
        Sample image of predicted taxon for display. None if no sample exists
        for the taxon or if anything went wrong while retrieving it.
    eol_page : str
        On success, an HTML snippet linking to the EOL page for the taxon (may
        be a lower rank, e.g., species sample). Otherwise, a plain-text message
        explaining why no image is available.
    '''
    logger.info(f"Getting sample for taxon: {pred_taxon} at rank: {rank}")

    try:
        filepath, eol_page_id, full_name, is_exact = get_sample_data(df, pred_taxon, rank)
    except Exception as e:
        logger.error(f"Error retrieving sample data: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."

    if filepath is None:
        logger.warning(f"No sample image found for taxon: {pred_taxon}")
        return None, f"Sorry, our EOL images do not include {pred_taxon}."

    # Get sample image of selected individual
    try:
        img_src = s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': 'treeoflife-10m-sample-images',
                    'Key': filepath},
        )
        img_resp = requests.get(img_src, timeout=_SAMPLE_IMAGE_TIMEOUT)
        # Fail on HTTP errors here: otherwise the error body would be handed to
        # PIL and reported as a misleading "cannot identify image file" error.
        img_resp.raise_for_status()
        img = Image.open(io.BytesIO(img_resp.content))

        full_eol_url = EOL_URL + eol_page_id
        if is_exact:
            eol_page = f"<p>Check out the EOL entry for {pred_taxon} to learn more: <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"
        else:
            eol_page = f"<p>Check out an example EOL entry within {pred_taxon} to learn more: {full_name} <a href={full_eol_url} target='_blank'>{full_eol_url}</a>.</p>"

        logger.info(f"Successfully retrieved sample image and EOL page for {pred_taxon}")
        return img, eol_page
    except Exception as e:
        # Best-effort feature: report the failure to the user instead of raising.
        logger.error(f"Error retrieving sample image: {e}")
        return None, f"We encountered the following error trying to retrieve a sample image: {e}."
def get_sample_data(df, pred_taxon, rank):
    '''
    Function to randomly select a sample individual of the given taxon and provide
    its image filepath, EOL page ID, and full taxonomic name.

    Parameters:
    -----------
    df : DataFrame
        DataFrame with all sample images listed and their filepaths (in "file_path" column).
    pred_taxon : str
        Predicted taxon of the uploaded image, as space-separated names ordered as in RANKS.
    rank : int
        Index of rank in RANKS chosen for prediction.

    Returns:
    --------
    filepath : str or None
        Filepath of selected sample image for predicted taxon. None if the taxon
        has no entries in df.
    eol_page_id : str
        EOL page ID associated with the selected sample for more information
        (np.nan if the taxon has no entries in df).
    full_name : str
        Full taxonomic name of the selected sample ("" if the taxon has no entries in df).
    is_exact : bool
        Flag indicating if the match is exact (i.e., with empty lower ranks).

    Raises:
    -------
    IndexError
        If pred_taxon has fewer than rank + 1 space-separated names.
    '''
    # Split once rather than on every iteration of the loop below.
    taxon_names = pred_taxon.split(" ")
    for idx in range(rank + 1):
        df = df.filter(pl.col(RANKS[idx]) == taxon_names[idx])

    if df.shape[0] == 0:
        return None, np.nan, "", False

    upper_ranks = RANKS[:rank + 1]
    lower_ranks = RANKS[rank + 1:]

    # First, try to find entries with empty lower ranks
    exact_df = df
    for lower_rank in lower_ranks:
        exact_df = exact_df.filter((pl.col(lower_rank).is_null()) | (pl.col(lower_rank) == ""))

    if exact_df.shape[0] > 0:
        df_filtered = exact_df.sample()
        full_name = " ".join(df_filtered.select(upper_ranks).row(0))
        return df_filtered["file_path"][0], df_filtered["eol_page_id"].cast(pl.String)[0], full_name, True

    # If no exact matches, return any entry with the specified rank
    df_filtered = df.sample()
    upper_name = " ".join(df_filtered.select(upper_ranks).row(0))
    # A sampled row may still have some lower ranks missing (null or ""); skip
    # them so the join neither raises TypeError on None nor adds stray spaces.
    lower_names = [name for name in df_filtered.select(lower_ranks).row(0) if name]
    full_name = " ".join([upper_name, *lower_names])
    return df_filtered["file_path"][0], df_filtered["eol_page_id"].cast(pl.String)[0], full_name, False
|