# barc_gradio/src/configs/dog_breeds/dog_breed_class.py
# Nadine Rueegg -- initial commit for barc (7629b39), 8.1 kB
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import difflib
import json
import pickle as pkl
import csv
import numpy as np
# ----------------------------------------------------------------------------------------------------------------- #
class DogBreed(object):
    """One dog breed, identified by an abbreviation, with optional per-dataset info.

    A breed can carry information from up to three datasets, named 'akc',
    'stanext' and 'xlsx' throughout this class. Dataset-specific attributes
    (``_path_akc``, ``_ind_in_stanext``, ``_ind_in_xlsx``,
    ``_ind_in_xlsx_matrix``, ``_clade``) are only created by the matching
    ``add_*_info`` method, so they do not exist on instances that were built
    without that dataset.
    """

    def __init__(
        self,
        abbrev,
        name_akc=None,
        name_stanext=None,
        name_xlsx=None,
        path_akc=None,
        path_stanext=None,
        ind_in_xlsx=None,
        ind_in_xlsx_matrix=None,
        ind_in_stanext=None,
        clade=None,
    ):
        """Create a breed; dataset info is attached only for the names given.

        For every dataset whose name is not None the corresponding
        ``add_*_info`` method is called, which asserts that the companion
        arguments of that dataset are not None either.
        """
        self._abbrev = abbrev
        self._name_xlsx = name_xlsx
        self._name_akc = name_akc
        self._name_stanext = name_stanext
        # stored unconditionally, because get_names() reads it even when no
        # stanext name was given
        self._path_stanext = path_stanext
        self._additional_names = set()
        if self._name_akc is not None:
            self.add_akc_info(name_akc, path_akc)
        if self._name_stanext is not None:
            self.add_stanext_info(name_stanext, path_stanext, ind_in_stanext)
        if self._name_xlsx is not None:
            self.add_xlsx_info(name_xlsx, ind_in_xlsx, ind_in_xlsx_matrix, clade)

    def add_xlsx_info(self, name_xlsx, ind_in_xlsx, ind_in_xlsx_matrix, clade):
        """Attach the 'xlsx' dataset info; all four arguments must be non-None."""
        assert (name_xlsx is not None) and (ind_in_xlsx is not None) and (ind_in_xlsx_matrix is not None) and (clade is not None)
        self._name_xlsx = name_xlsx
        self._ind_in_xlsx = ind_in_xlsx
        self._ind_in_xlsx_matrix = ind_in_xlsx_matrix
        self._clade = clade

    def add_stanext_info(self, name_stanext, path_stanext, ind_in_stanext):
        """Attach the 'stanext' dataset info; all three arguments must be non-None."""
        assert (name_stanext is not None) and (path_stanext is not None) and (ind_in_stanext is not None)
        self._name_stanext = name_stanext
        self._path_stanext = path_stanext
        self._ind_in_stanext = ind_in_stanext

    def add_akc_info(self, name_akc, path_akc):
        """Attach the 'akc' dataset info; both arguments must be non-None."""
        assert (name_akc is not None) and (path_akc is not None)
        self._name_akc = name_akc
        self._path_akc = path_akc

    def add_additional_names(self, name_list):
        """Add every entry of *name_list* to the set of additional names."""
        # build a new set instead of updating in place, so a set object that
        # is shared with another holder is never mutated
        self._additional_names = self._additional_names.union(name_list)

    def add_text_info(self, text_height, text_weight, text_life_exp):
        """Store the height, weight and life-expectancy texts on the instance."""
        self._text_height = text_height
        self._text_weight = text_weight
        self._text_life_exp = text_life_exp

    def get_datasets(self):
        """Return the set of dataset labels ('akc', 'stanext', 'xlsx') that have a name."""
        datasets = set()
        if self._name_akc is not None:
            datasets.add('akc')
        if self._name_stanext is not None:
            datasets.add('stanext')
        if self._name_xlsx is not None:
            datasets.add('xlsx')
        return datasets

    def get_names(self):
        """Return the set of all known names for this breed.

        The set holds the abbreviation, the three dataset names, the stanext
        path and all additional names; None entries are removed.
        """
        names = {self._abbrev, self._name_akc, self._name_stanext, self._name_xlsx, self._path_stanext}.union(self._additional_names)
        names.discard(None)
        return names

    def get_names_as_pointing_dict(self):
        """Return a dict in which every name of this breed maps to its abbreviation."""
        return {name: self._abbrev for name in self.get_names()}

    def print_overview(self):
        """Print the names, datasets and instance attributes of this breed.

        The headline uses the akc name, then the xlsx name, then the stanext
        name. If none of them is set, the abbreviation is used, so that an
        abbreviation-only instance can be printed as well.
        """
        if self._name_akc is not None:
            name = self._name_akc
        elif self._name_xlsx is not None:
            name = self._name_xlsx
        elif self._name_stanext is not None:
            name = self._name_stanext
        else:
            # no dataset name available: concatenating None would raise a TypeError
            name = self._abbrev
        print('----------------------------------------------------')
        print('----- dog breed: ' + str(name))
        print('----------------------------------------------------')
        print('[names]')
        print(self.get_names())
        print('[datasets]')
        print(self.get_datasets())
        # see https://stackoverflow.com/questions/9058305/getting-attributes-of-a-class
        print('[instance attributes]')
        for attribute, value in self.__dict__.items():
            print(attribute, '=', value)

    def use_dict_to_save_class_instance(self):
        """Return a new dict holding all instance attributes (shallow copy)."""
        return dict(self.__dict__)

    def use_dict_to_load_class_instance(self, my_dict):
        """Set every key/value pair of *my_dict* as an attribute of this instance."""
        for attribute, value in my_dict.items():
            setattr(self, attribute, value)
# ----------------------------------------------------------------------------------------------------------------- #
def get_name_list_from_summary(summary):
    """Map the abbreviation of every breed in *summary* to a list of its names.

    *summary* is a dict whose values provide ``_abbrev`` and ``get_names()``;
    its keys are not used. The order inside each name list is not defined,
    because the names come from a set.
    """
    return {
        breed._abbrev: list(breed.get_names())
        for breed in summary.values()
    }
def get_partial_summary(summary, part):
    """Return the entries of *summary* whose breed has a name in dataset *part*.

    *part* must be 'xlsx', 'akc' or 'stanext' (checked with an assertion).
    Only the name attribute of the requested dataset is read from each breed.
    """
    assert part in ('xlsx', 'akc', 'stanext')
    if part == 'xlsx':
        name_attribute = '_name_xlsx'
    elif part == 'akc':
        name_attribute = '_name_akc'
    elif part == 'stanext':
        name_attribute = '_name_stanext'
    else:
        # only reachable when assertions are disabled: nothing can match
        return {}
    return {
        key: breed
        for key, breed in summary.items()
        if getattr(breed, name_attribute) is not None
    }
def get_akc_but_not_stanext_partial_summary(summary):
    """Return the entries of *summary* that have an akc name but no stanext name."""
    # the stanext name is only looked at for breeds that do have an akc name
    return {
        key: breed
        for key, breed in summary.items()
        if breed._name_akc is not None and breed._name_stanext is None
    }
# ----------------------------------------------------------------------------------------------------------------- #
def main_load_dog_breed_classes(path_complete_abbrev_dict_v1, path_complete_summary_breeds_v1):
    """Load the abbreviation dict and rebuild the DogBreed summary from pickle files.

    The second file stores, per key, a plain dict of instance attributes; each
    of them is turned back into a DogBreed instance.

    NOTE: both files are read with pickle, so they must come from a trusted source.

    Returns:
        (complete_abbrev_dict, complete_summary_breeds)
    """
    with open(path_complete_abbrev_dict_v1, 'rb') as abbrev_file:
        complete_abbrev_dict = pkl.load(abbrev_file)
    with open(path_complete_summary_breeds_v1, 'rb') as summary_file:
        stored_attributes = pkl.load(summary_file)

    complete_summary_breeds = {}
    for key, attributes in stored_attributes.items():
        # create the instance from its abbreviation, then restore all stored attributes
        breed = DogBreed(abbrev=attributes['_abbrev'])
        complete_summary_breeds[key] = breed
        breed.use_dict_to_load_class_instance(attributes)
    return complete_abbrev_dict, complete_summary_breeds
# ----------------------------------------------------------------------------------------------------------------- #
def load_similarity_matrix_raw(xlsx_path):
    """Read the 168 x 168 similarity matrix and the abbreviation indices from an Excel file.

    Layout read from the sheet (positions as returned by ``pd.read_excel``):
    the abbreviations are in the third column, rows 1..168; the matrix values
    are in the 168 columns starting at the fourth column, rows 1..168.

    Returns:
        matrix_raw: float array of shape (168, 168); the first axis runs over
            the sheet columns, the second axis over the sheet rows.
        abbrev_indices: dict mapping each abbreviation to its 0-based index.
    """
    # --- LOAD EXCEL FILE FROM DOG BREED PAPER
    sheet = pd.read_excel(xlsx_path)
    n_breeds = 168
    first_data_row = 1
    first_data_col = 3

    # abbreviation -> 0-based index
    abbrev_column = sheet[sheet.columns[2]]
    abbrev_indices = {}
    for offset in range(n_breeds):
        abbrev_indices[abbrev_column[first_data_row + offset]] = offset

    # copy the values cell by cell, converting each one to float
    matrix_raw = np.zeros((n_breeds, n_breeds))
    for col_offset in range(n_breeds):
        column_values = sheet[sheet.columns[first_data_col + col_offset]]
        for row_offset in range(n_breeds):
            matrix_raw[col_offset, row_offset] = float(column_values[first_data_row + row_offset])
    return matrix_raw, abbrev_indices
# ----------------------------------------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------------------------------------- #
# Load the final dict of dog breed classes (created in advance and stored as pickle files).
# NOTE: everything below runs at import time, so importing this module reads the
# breed data files from <three directories above this file's directory>/data/breed_data.
ROOT_PATH_BREED_DATA = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', '..', 'data', 'breed_data')
# NOTE: the variable names end in "_v1", but the files that are loaded are the "_v2" pickles.
path_complete_abbrev_dict_v1 = os.path.join(ROOT_PATH_BREED_DATA, 'complete_abbrev_dict_v2.pkl')
path_complete_summary_breeds_v1 = os.path.join(ROOT_PATH_BREED_DATA, 'complete_summary_breeds_v2.pkl')
COMPLETE_ABBREV_DICT, COMPLETE_SUMMARY_BREEDS = main_load_dog_breed_classes(path_complete_abbrev_dict_v1, path_complete_summary_breeds_v1)
# Load the similarity matrix; data from:
# Parker H. G., Dreger D. L., Rimbault M., Davis B. W., Mullen A. B., Carpintero-Ramirez G., and Ostrander E. A.
# Genomic analyses reveal the influence of geographic origin, migration, and hybridization on modern dog breed
# development. Cell Reports, 19(4):697–708, 2017.
xlsx_path = os.path.join(ROOT_PATH_BREED_DATA, 'NIHMS866262-supplement-2.xlsx')
# SIM_MATRIX_RAW is the 168 x 168 float matrix, SIM_ABBREV_INDICES maps abbreviation -> matrix index.
SIM_MATRIX_RAW, SIM_ABBREV_INDICES = load_similarity_matrix_raw(xlsx_path)