File size: 8,099 Bytes
7629b39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import pandas as pd
import difflib
import json
import pickle as pkl
import csv
import numpy as np


# ----------------------------------------------------------------------------------------------------------------- #
class DogBreed(object):
    """Information about a single dog breed, collected from up to three sources.

    A breed is identified by its abbreviation and may additionally carry
    data for the 'akc', 'stanext' and 'xlsx' datasets. A dataset counts as
    present for this breed when the corresponding ``_name_*`` attribute is
    not None.

    Attributes such as ``_path_akc``, ``_ind_in_stanext``, ``_ind_in_xlsx``,
    ``_ind_in_xlsx_matrix``, ``_clade`` and the ``_text_*`` fields only exist
    on the instance once the matching ``add_*`` method has been called.
    """

    def __init__(self, abbrev, name_akc=None, name_stanext=None, name_xlsx=None, path_akc=None, path_stanext=None, ind_in_xlsx=None, ind_in_xlsx_matrix=None, ind_in_stanext=None, clade=None):
        """Create a breed; dataset details are registered for every given name.

        Raises:
            AssertionError: if a dataset name is given but one of the values
                belonging to that dataset is None (see the ``add_*`` methods).
        """
        # The assignment order below determines the order of ``__dict__`` and
        # therefore of print_overview() / use_dict_to_save_class_instance().
        self._abbrev = abbrev
        self._name_xlsx = name_xlsx
        self._name_akc = name_akc
        self._name_stanext = name_stanext
        self._path_stanext = path_stanext
        self._additional_names = set()
        if self._name_akc is not None:
            self.add_akc_info(name_akc, path_akc)
        if self._name_stanext is not None:
            self.add_stanext_info(name_stanext, path_stanext, ind_in_stanext)
        if self._name_xlsx is not None:
            self.add_xlsx_info(name_xlsx, ind_in_xlsx, ind_in_xlsx_matrix, clade)

    def add_xlsx_info(self, name_xlsx, ind_in_xlsx, ind_in_xlsx_matrix, clade):
        """Attach the 'xlsx' dataset information; all four values are required."""
        assert (name_xlsx is not None) and (ind_in_xlsx is not None) and (ind_in_xlsx_matrix is not None) and (clade is not None)
        self._name_xlsx = name_xlsx
        self._ind_in_xlsx = ind_in_xlsx
        self._ind_in_xlsx_matrix = ind_in_xlsx_matrix
        self._clade = clade

    def add_stanext_info(self, name_stanext, path_stanext, ind_in_stanext):
        """Attach the 'stanext' dataset information; all three values are required."""
        assert (name_stanext is not None) and (path_stanext is not None) and (ind_in_stanext is not None)
        self._name_stanext = name_stanext
        self._path_stanext = path_stanext
        self._ind_in_stanext = ind_in_stanext

    def add_akc_info(self, name_akc, path_akc):
        """Attach the 'akc' dataset information; both values are required."""
        assert (name_akc is not None) and (path_akc is not None)
        self._name_akc = name_akc
        self._path_akc = path_akc

    def add_additional_names(self, name_list):
        """Register further alias names for this breed (duplicates are ignored)."""
        # Build a new set instead of updating in place: a dict returned earlier
        # by use_dict_to_save_class_instance() may still reference the old set.
        self._additional_names = self._additional_names.union(name_list)

    def add_text_info(self, text_height, text_weight, text_life_exp):
        """Store the height, weight and life-expectancy texts as given."""
        self._text_height = text_height
        self._text_weight = text_weight
        self._text_life_exp = text_life_exp

    def get_datasets(self):
        """Return the set of dataset keys ('akc', 'stanext', 'xlsx') containing this breed."""
        datasets = set()
        if self._name_akc is not None:
            datasets.add('akc')
        if self._name_stanext is not None:
            datasets.add('stanext')
        if self._name_xlsx is not None:
            datasets.add('xlsx')
        return datasets

    def get_names(self):
        """Return every known name of this breed as a set.

        The set contains the abbreviation, the three dataset names, the
        stanext path and all additional names; None entries are dropped.
        """
        names = {self._abbrev, self._name_akc, self._name_stanext, self._name_xlsx, self._path_stanext}.union(self._additional_names)
        names.discard(None)
        return names

    def get_names_as_pointing_dict(self):
        """Return a dict in which each known name points to the abbreviation."""
        return {name: self._abbrev for name in self.get_names()}

    def print_overview(self):
        """Print the names, datasets and all instance attributes of this breed."""
        # Headline name: prefer akc, then xlsx, then stanext.
        if self._name_akc is not None:
            name = self._name_akc
        elif self._name_xlsx is not None:
            name = self._name_xlsx
        elif self._name_stanext is not None:
            name = self._name_stanext
        else:
            # A breed may be created from its abbreviation alone; concatenating
            # None below would raise TypeError, so fall back to the abbreviation.
            name = str(self._abbrev)
        print('----------------------------------------------------')
        print('----- dog breed: ' + name )
        print('----------------------------------------------------')
        print('[names]')
        print(self.get_names())
        print('[datasets]')
        print(self.get_datasets())
        # see https://stackoverflow.com/questions/9058305/getting-attributes-of-a-class
        print('[instance attributes]')
        for attribute, value in self.__dict__.items():
            print(attribute, '=', value)

    def use_dict_to_save_class_instance(self):
        """Return a shallow copy of the instance attributes as a plain dict."""
        return dict(self.__dict__)

    def use_dict_to_load_class_instance(self, my_dict):
        """Set every key/value pair of ``my_dict`` as an attribute on this instance."""
        for attribute, value in my_dict.items():
            setattr(self, attribute, value)
        return

# ----------------------------------------------------------------------------------------------------------------- #
def get_name_list_from_summary(summary):
    """Map each breed abbreviation to a list of all names known for that breed.

    ``summary`` is a dict whose values expose ``_abbrev`` and ``get_names()``;
    the keys of ``summary`` are not used. The order inside each list is
    whatever iterating the result of ``get_names()`` yields.
    """
    return {
        breed._abbrev: list(breed.get_names())
        for breed in summary.values()
    }
def get_partial_summary(summary, part):
    """Return the entries of ``summary`` whose breed belongs to dataset ``part``.

    ``part`` must be 'xlsx', 'akc' or 'stanext'. A breed belongs to a dataset
    when its matching ``_name_<part>`` attribute is not None.
    """
    assert part in ['xlsx', 'akc', 'stanext']
    # Pick the attribute to inspect once, instead of re-testing `part` per breed.
    if part == 'xlsx':
        name_attribute = '_name_xlsx'
    elif part == 'akc':
        name_attribute = '_name_akc'
    elif part == 'stanext':
        name_attribute = '_name_stanext'
    else:
        # Only reachable when assertions are disabled; no breed matches then.
        name_attribute = None
    return {
        abbrev: breed
        for abbrev, breed in summary.items()
        if name_attribute is not None and getattr(breed, name_attribute) is not None
    }
def get_akc_but_not_stanext_partial_summary(summary):
    """Return the entries of ``summary`` that have an akc name but no stanext name."""
    return {
        abbrev: breed
        for abbrev, breed in summary.items()
        if breed._name_akc is not None and breed._name_stanext is None
    }

# ----------------------------------------------------------------------------------------------------------------- #
def main_load_dog_breed_classes(path_complete_abbrev_dict_v1, path_complete_summary_breeds_v1):
    """Load the pickled abbreviation dict and rebuild the DogBreed summary.

    The second pickle file holds, per key, a dict of instance attributes
    (including '_abbrev'). Each of these dicts is turned back into a DogBreed
    instance.

    Returns:
        (complete_abbrev_dict, complete_summary_breeds): the first pickle's
        content unchanged, and a dict with the same keys as the second
        pickle's content, holding DogBreed instances.
    """
    # NOTE: pickle files must come from a trusted source; unpickling can
    # execute arbitrary code.
    with open(path_complete_abbrev_dict_v1, 'rb') as abbrev_file:
        complete_abbrev_dict = pkl.load(abbrev_file)
    with open(path_complete_summary_breeds_v1, 'rb') as summary_file:
        stored_attributes = pkl.load(summary_file)

    complete_summary_breeds = {}
    for key, attributes_only in stored_attributes.items():
        # Create a minimal instance first, then restore all saved attributes.
        breed = DogBreed(abbrev=attributes_only['_abbrev'])
        breed.use_dict_to_load_class_instance(attributes_only)
        complete_summary_breeds[key] = breed
    return complete_abbrev_dict, complete_summary_breeds


# ----------------------------------------------------------------------------------------------------------------- #
def load_similarity_matrix_raw(xlsx_path, n_breeds=168):
    """Load the raw breed similarity matrix from the dog breed paper's Excel file.

    The sheet is read with the ``pandas.read_excel`` defaults. Of the
    resulting frame, row 0 is skipped, rows 1..n_breeds hold one breed each,
    column 2 holds the breed abbreviation, and the ``n_breeds`` columns
    starting at column 3 hold the matrix values.

    Args:
        xlsx_path: path of the Excel file.
        n_breeds: number of breeds, i.e. the side length of the matrix.

    Returns:
        (matrix_raw, abbrev_indices): ``matrix_raw`` is a float array of shape
        (n_breeds, n_breeds) where ``matrix_raw[i, j]`` is the cell in value
        column ``i`` and breed row ``j`` of the sheet. ``abbrev_indices`` maps
        each abbreviation to its 0-based breed index.

    Raises:
        ValueError: if the sheet has fewer rows or columns than required.
    """
    # --- LOAD EXCEL FILE FROM DOG BREED PAPER
    xlsx = pd.read_excel(xlsx_path)
    first_row, abbrev_col, first_value_col = 1, 2, 3
    last_row = first_row + n_breeds
    last_value_col = first_value_col + n_breeds
    n_rows, n_cols = xlsx.shape
    # Slicing with iloc would silently truncate, so check the size explicitly.
    if n_rows < last_row or n_cols < last_value_col:
        raise ValueError(
            'expected at least {} rows and {} columns in {}, found {} rows and {} columns'.format(
                last_row, last_value_col, xlsx_path, n_rows, n_cols))
    # abbreviation -> 0-based index of the breed
    abbrevs = xlsx.iloc[first_row:last_row, abbrev_col].tolist()
    abbrev_indices = {abbrev: ind for ind, abbrev in enumerate(abbrevs)}
    # Extract the whole value block at once (rows = breeds, columns = value
    # columns) and transpose it, so the first axis runs over the value columns.
    values = xlsx.iloc[first_row:last_row, first_value_col:last_value_col].to_numpy(dtype=float)
    matrix_raw = np.ascontiguousarray(values.T)
    return matrix_raw, abbrev_indices



# ----------------------------------------------------------------------------------------------------------------- #
# ----------------------------------------------------------------------------------------------------------------- #
# load the (in advance created) final dict of dog breed classes
# The breed data directory is resolved relative to this file's location
# (three directory levels up, then data/breed_data).
ROOT_PATH_BREED_DATA = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..', '..', 'data', 'breed_data')
# NOTE(review): the variable names end in "_v1" but point at the "_v2" pickle
# files -- presumably the names were kept after the data was regenerated; confirm.
path_complete_abbrev_dict_v1 = os.path.join(ROOT_PATH_BREED_DATA, 'complete_abbrev_dict_v2.pkl')
path_complete_summary_breeds_v1 = os.path.join(ROOT_PATH_BREED_DATA, 'complete_summary_breeds_v2.pkl')
# Runs at import time: importing this module reads both pickle files from disk.
COMPLETE_ABBREV_DICT, COMPLETE_SUMMARY_BREEDS = main_load_dog_breed_classes(path_complete_abbrev_dict_v1, path_complete_summary_breeds_v1)
# load similarity matrix, data from:
#   Parker H. G., Dreger D. L., Rimbault M., Davis B. W., Mullen A. B., Carpintero-Ramirez G., and Ostrander E. A.
#   Genomic analyses reveal the influence of geographic origin, migration, and hybridization on modern dog breed
#   development. Cell Reports, 19(4):697–708, 2017.
xlsx_path = os.path.join(ROOT_PATH_BREED_DATA, 'NIHMS866262-supplement-2.xlsx')
# Also runs at import time: the Excel file is parsed on every import of this module.
SIM_MATRIX_RAW, SIM_ABBREV_INDICES = load_similarity_matrix_raw(xlsx_path)