Spaces:
Runtime error
Runtime error
File size: 8,099 Bytes
7629b39 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pandas as pd
import difflib
import json
import pickle as pkl
import csv
import numpy as np
# ----------------------------------------------------------------------------------------------------------------- #
class DogBreed(object):
    """Record describing one dog breed across up to three data sources.

    A breed is identified by its abbreviation and may additionally carry the
    names, paths and indices under which it appears in the 'akc', 'stanext'
    and 'xlsx' datasets.

    Source-specific attributes (``_path_akc``, ``_ind_in_stanext``,
    ``_ind_in_xlsx``, ``_ind_in_xlsx_matrix``, ``_clade`` and the ``_text_*``
    fields) are only created by the matching ``add_*`` method, so they may be
    absent on an instance that never received that information.
    """

    def __init__(self, abbrev, name_akc=None, name_stanext=None, name_xlsx=None, path_akc=None, path_stanext=None, ind_in_xlsx=None, ind_in_xlsx_matrix=None, ind_in_stanext=None, clade=None):
        """Create a breed and register every dataset for which a name is given.

        For each of ``name_akc``, ``name_stanext`` and ``name_xlsx`` that is
        not None, the corresponding ``add_*_info`` method is called, which in
        turn requires the related path/index/clade arguments to be non-None.
        """
        self._abbrev = abbrev
        self._name_xlsx = name_xlsx
        self._name_akc = name_akc
        self._name_stanext = name_stanext
        self._path_stanext = path_stanext
        self._additional_names = set()
        if self._name_akc is not None:
            self.add_akc_info(name_akc, path_akc)
        if self._name_stanext is not None:
            self.add_stanext_info(name_stanext, path_stanext, ind_in_stanext)
        if self._name_xlsx is not None:
            self.add_xlsx_info(name_xlsx, ind_in_xlsx, ind_in_xlsx_matrix, clade)

    def add_xlsx_info(self, name_xlsx, ind_in_xlsx, ind_in_xlsx_matrix, clade):
        """Store the xlsx name, both xlsx indices and the clade (all required)."""
        assert (name_xlsx is not None) and (ind_in_xlsx is not None) and (ind_in_xlsx_matrix is not None) and (clade is not None)
        self._name_xlsx = name_xlsx
        self._ind_in_xlsx = ind_in_xlsx
        self._ind_in_xlsx_matrix = ind_in_xlsx_matrix
        self._clade = clade

    def add_stanext_info(self, name_stanext, path_stanext, ind_in_stanext):
        """Store the stanext name, path and index (all required)."""
        assert (name_stanext is not None) and (path_stanext is not None) and (ind_in_stanext is not None)
        self._name_stanext = name_stanext
        self._path_stanext = path_stanext
        self._ind_in_stanext = ind_in_stanext

    def add_akc_info(self, name_akc, path_akc):
        """Store the akc name and path (both required)."""
        assert (name_akc is not None) and (path_akc is not None)
        self._name_akc = name_akc
        self._path_akc = path_akc

    def add_additional_names(self, name_list):
        """Merge the names in ``name_list`` into the set of additional names."""
        self._additional_names = self._additional_names.union(set(name_list))

    def add_text_info(self, text_height, text_weight, text_life_exp):
        """Store the textual height, weight and life-expectancy descriptions."""
        self._text_height = text_height
        self._text_weight = text_weight
        self._text_life_exp = text_life_exp

    def get_datasets(self):
        """Return the set of dataset labels ('akc', 'stanext', 'xlsx') containing this breed."""
        datasets = set()
        if self._name_akc is not None:
            datasets.add('akc')
        if self._name_stanext is not None:
            datasets.add('stanext')
        if self._name_xlsx is not None:
            datasets.add('xlsx')
        return datasets

    def get_names(self):
        """Return the set of all names known for this breed.

        The set combines the abbreviation, the three dataset names, the
        stanext path and any additional names; None entries are dropped.
        """
        names = {self._abbrev, self._name_akc, self._name_stanext, self._name_xlsx, self._path_stanext}.union(self._additional_names)
        names.discard(None)
        return names

    def get_names_as_pointing_dict(self):
        """Return a dict in which every known name maps to the abbreviation."""
        return {name: self._abbrev for name in self.get_names()}

    def print_overview(self):
        """Print the names, datasets and instance attributes of this breed.

        The headline uses the akc name if present, otherwise the xlsx name,
        otherwise the stanext name. If none of them is set, the abbreviation
        is used so that a breed without dataset names can still be printed
        (concatenating None to the headline would raise TypeError).
        """
        if self._name_akc is not None:
            name = self._name_akc
        elif self._name_xlsx is not None:
            name = self._name_xlsx
        elif self._name_stanext is not None:
            name = self._name_stanext
        else:
            name = self._abbrev
        print('----------------------------------------------------')
        print('----- dog breed: ' + str(name))
        print('----------------------------------------------------')
        print('[names]')
        print(self.get_names())
        print('[datasets]')
        print(self.get_datasets())
        # see https://stackoverflow.com/questions/9058305/getting-attributes-of-a-class
        print('[instance attributes]')
        for attribute, value in self.__dict__.items():
            print(attribute, '=', value)

    def use_dict_to_save_class_instance(self):
        """Return a new dict holding every instance attribute (shallow copy)."""
        return dict(self.__dict__)

    def use_dict_to_load_class_instance(self, my_dict):
        """Set every key/value pair of ``my_dict`` as an attribute on this instance."""
        for attribute, value in my_dict.items():
            setattr(self, attribute, value)
        return
# ----------------------------------------------------------------------------------------------------------------- #
def get_name_list_from_summary(summary):
    """Build a dict mapping each breed's abbreviation to a list of its names.

    ``summary`` is a dict whose values expose ``_abbrev`` and ``get_names()``.
    The result is keyed by the abbreviation of each value, not by the keys of
    ``summary``. The list order follows iteration over the returned name set.
    """
    return {entry._abbrev: list(entry.get_names()) for entry in summary.values()}
def get_partial_summary(summary, part):
    """Return the entries of ``summary`` that belong to one dataset.

    ``part`` must be 'xlsx', 'akc' or 'stanext'. An entry is kept when its
    name attribute for that dataset (``_name_xlsx``, ``_name_akc`` or
    ``_name_stanext``) is not None. Keys and values are carried over as-is.
    """
    assert part in ['xlsx', 'akc', 'stanext']
    # only the attribute of the requested dataset is ever read
    name_attribute = {
        'xlsx': '_name_xlsx',
        'akc': '_name_akc',
        'stanext': '_name_stanext',
    }[part]
    return {
        abbrev: breed
        for abbrev, breed in summary.items()
        if getattr(breed, name_attribute) is not None
    }
def get_akc_but_not_stanext_partial_summary(summary):
    """Return the entries of ``summary`` that have an akc name but no stanext name.

    An entry is kept when ``_name_akc`` is not None and ``_name_stanext`` is
    None. Keys and values are carried over unchanged.
    """
    return {
        abbrev: breed
        for abbrev, breed in summary.items()
        if breed._name_akc is not None and breed._name_stanext is None
    }
# ----------------------------------------------------------------------------------------------------------------- #
def main_load_dog_breed_classes(path_complete_abbrev_dict_v1, path_complete_summary_breeds_v1):
    """Load the pickled abbreviation dict and rebuild the DogBreed instances.

    The first file is unpickled and returned as-is. The second file is
    unpickled to a dict of attribute dicts; each one is turned back into a
    DogBreed by constructing it from its '_abbrev' entry and then restoring
    all stored attributes onto it.

    NOTE: both files are read with pickle, so they must come from a trusted
    source.

    Returns:
        (complete_abbrev_dict, complete_summary_breeds), the latter keyed like
        the stored summary dict.
    """
    with open(path_complete_abbrev_dict_v1, 'rb') as abbrev_file:
        complete_abbrev_dict = pkl.load(abbrev_file)
    with open(path_complete_summary_breeds_v1, 'rb') as summary_file:
        stored_attributes = pkl.load(summary_file)
    complete_summary_breeds = {}
    for key, attributes_only in stored_attributes.items():
        breed = DogBreed(abbrev=attributes_only['_abbrev'])
        breed.use_dict_to_load_class_instance(attributes_only)
        complete_summary_breeds[key] = breed
    return complete_abbrev_dict, complete_summary_breeds
# ----------------------------------------------------------------------------------------------------------------- #
def load_similarity_matrix_raw(xlsx_path, n_breeds=168):
    """Load the raw breed similarity matrix from the Excel sheet of the dog breed paper.

    The sheet is read with ``pd.read_excel``. Starting at data row 1, the
    column at position 2 supplies the breed abbreviations, and the
    ``n_breeds`` columns starting at position 3 supply the matrix values.

    Args:
        xlsx_path: path of the Excel file.
        n_breeds: number of breeds, i.e. the side length of the square
            matrix. Defaults to 168, the size that was previously hard-coded.

    Returns:
        (matrix_raw, abbrev_indices): ``matrix_raw`` is a float array of shape
        (n_breeds, n_breeds) where ``matrix_raw[i, j]`` is the value found in
        sheet column ``3 + i`` and data row ``1 + j``. ``abbrev_indices`` maps
        each abbreviation to its zero-based index.

    Raises:
        ValueError: if the sheet does not contain the expected block of cells.
    """
    # --- LOAD EXCEL FILE FROM DOG BREED PAPER
    xlsx = pd.read_excel(xlsx_path)
    first_row, abbrev_col, first_value_col = 1, 2, 3
    rows = slice(first_row, first_row + n_breeds)
    value_block = xlsx.iloc[rows, first_value_col:first_value_col + n_breeds]
    if value_block.shape != (n_breeds, n_breeds):
        raise ValueError(
            'expected a %d x %d block of similarity values, found %s'
            % (n_breeds, n_breeds, value_block.shape)
        )
    # abbreviation -> zero-based index
    abbrev_indices = {
        abbrev: ind for ind, abbrev in enumerate(xlsx.iloc[rows, abbrev_col])
    }
    # the sheet block is indexed [row, column] but the matrix is filled as
    # [column, row], hence the transpose
    matrix_raw = np.ascontiguousarray(value_block.to_numpy(dtype=float).T)
    return matrix_raw, abbrev_indices
# ----------------------------------------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------------------------------------- #
# Load the final dict of dog breed classes (created in advance).
# NOTE: the statements below run at import time and read three files located
# in ROOT_PATH_BREED_DATA (three directory levels above this file, then 'data/breed_data').
ROOT_PATH_BREED_DATA = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', '..', 'data', 'breed_data')
# NOTE(review): the variable names end in '_v1' although the files loaded are the '_v2' pickles.
path_complete_abbrev_dict_v1 = os.path.join(ROOT_PATH_BREED_DATA, 'complete_abbrev_dict_v2.pkl')
path_complete_summary_breeds_v1 = os.path.join(ROOT_PATH_BREED_DATA, 'complete_summary_breeds_v2.pkl')
COMPLETE_ABBREV_DICT, COMPLETE_SUMMARY_BREEDS = main_load_dog_breed_classes(path_complete_abbrev_dict_v1, path_complete_summary_breeds_v1)
# Load the similarity matrix; data from:
# Parker H. G., Dreger D. L., Rimbault M., Davis B. W., Mullen A. B., Carpintero-Ramirez G., and Ostrander E. A.
# Genomic analyses reveal the influence of geographic origin, migration, and hybridization on modern dog breed
# development. Cell Reports, 19(4):697–708, 2017.
xlsx_path = os.path.join(ROOT_PATH_BREED_DATA, 'NIHMS866262-supplement-2.xlsx')
SIM_MATRIX_RAW, SIM_ABBREV_INDICES = load_similarity_matrix_raw(xlsx_path)
|