yifanxie commited on
Commit
5c4b960
·
1 Parent(s): 12898d6

simple commit with util codes

Browse files
app.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Streamlit entry point: importing the library and setting the page title is
# all this script does at this commit.
import streamlit as st
st.title('Numerai Dashboard')
project_tools/__init__.py ADDED
File without changes
project_tools/numerapi_utils.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numerapi
2
+ from numerapi import utils
3
+ from project_tools import project_config, project_utils
4
+ from typing import List, Dict
5
+ import pandas as pd
6
+ import numpy as np
7
+
8
# Module-level NumerAPI client, created at import time without credentials and
# used by the query helpers in this module.
napi = numerapi.NumerAPI()
9
+
10
+
11
+ # def get_round
12
+
13
+
14
+ # deprecated
15
+ # def get_model_history(model):
16
+ # res = napi.daily_user_performances(model)
17
+ # res = pd.DataFrame.from_dict(res)
18
+ # res['payoutPending'] = res['payoutPending'].astype(np.float64)
19
+ # res['payoutSettled'] = res['payoutSettled'].astype(np.float64)
20
+ # res['stakeValue'] = res['stakeValue'].astype(np.float64)
21
+ # res['deltaRatio'] = res['payoutPending'] / res['stakeValue']
22
+ # res['realised_pl'] = project_utils.series_reverse_cumsum(res['payoutSettled'])
23
+ # res['floating_pl'] = project_utils.series_reverse_cumsum(res['payoutPending']) - res['realised_pl']
24
+ # res['current_stake'] = res['stakeValue'] - res['floating_pl']
25
+ # rename_dict = {'stakeValue':'floating_stake'}
26
+ # res = res.rename(columns=rename_dict)
27
+ # # res['equity'] = res['stakeValue'] + res['floating_pl']
28
+ # # cols = res.columns.tolist()
29
+ # # res = res[['model'] + cols]
30
+ #
31
+ # res['model'] = model
32
+ # cols = ['model', 'date', 'current_stake', 'floating_stake', 'payoutPending', 'floating_pl', 'realised_pl']
33
+ # res = res[cols]
34
+ # return res
35
+
36
+
37
def get_portfolio_overview(models, onlylatest=True):
    """Collect the payout history of several models into one dataframe.

    Args:
        models: iterable of model names, each passed to
            ``get_model_history_v3``.
        onlylatest (bool, optional): when True only the row labelled 0 of each
            model's history is kept and the combined frame is sorted by
            ``floating_pl`` in descending order; when False every row of
            every model is kept in the order the models were given.
    Returns:
        pandas.DataFrame with a fresh 0..n-1 index, or None when ``models``
        yields nothing.
    """
    frames = []
    for name in models:
        print(f'extracting information for model {name}')
        history = get_model_history_v3(name)
        # .loc[0:0] is label based: it keeps the row whose index label is 0.
        frames.append(history.loc[0:0] if onlylatest else history)

    if not frames:
        return None

    combined = pd.concat(frames, axis=0)
    if onlylatest:
        combined = combined.sort_values(by='floating_pl', ascending=False)
    return combined.reset_index(drop=True)
58
+
59
+
60
+
61
+
62
+
63
+
64
def get_competitions(tournament=8):
    """Retrieve the list of rounds known to the tournament API.

    Args:
        tournament (int, optional): ID of the tournament, defaults to 8.
    Returns:
        list of dicts: one dict per round. The query requests only the
        following items:
            * number
            * resolveTime
            * openTime
            * resolvedGeneral
            * resolvedStaking
        ``openTime`` and ``resolveTime`` are passed through
        ``utils.parse_datetime_string``. The ``prizePoolNmr`` /
        ``prizePoolUsd`` conversions are still attempted although the query
        does not request those fields.
    Example:
        >>> get_competitions()
        [{'number': 71, 'openTime': ..., 'resolveTime': ...,
          'resolvedGeneral': True, 'resolvedStaking': True}, ...]
    """
    query = '''
        query($tournament: Int!) {
          rounds(tournament: $tournament) {
            number
            resolveTime
            openTime
            resolvedGeneral
            resolvedStaking
          }
        }
    '''
    response = napi.raw_query(query, {'tournament': tournament})
    rounds = response['data']['rounds']
    # convert the string payloads into python objects, round by round
    for entry in rounds:
        for key in ("openTime", "resolveTime"):
            utils.replace(entry, key, utils.parse_datetime_string)
        for key in ("prizePoolNmr", "prizePoolUsd"):
            utils.replace(entry, key, utils.parse_float_string)
    return rounds
122
+
123
+
124
def daily_submissions_performances(username: str) -> List[Dict]:
    """Fetch the daily submission performance entries of a user.

    Args:
        username (str): value sent as the ``username`` argument of the
            ``v2UserProfile`` query.
    Returns:
        list of dicts: the ``dailySubmissionPerformances`` entries for which
        at least one of ``correlation``, ``fnc`` or ``mmc`` is truthy.
        Each entry holds the fields requested by the query:
            * date (run through ``utils.parse_datetime_string``)
            * correlation
            * corrPercentile
            * roundNumber
            * mmc
            * mmcPercentile
            * fnc
            * fncPercentile
            * correlationWithMetamodel
    Example:
        >>> daily_submissions_performances("uuazed")
        [{'roundNumber': 181,
          'correlation': -0.011765912,
          'date': datetime.datetime(2019, 10, 16, 0, 0),
          'mmc': 0.3,
          'fnc': 0.1,
          'correlationWithMetamodel': 0.87},
         ...
        ]
    """
    query = """
        query($username: String!) {
          v2UserProfile(username: $username) {
            dailySubmissionPerformances {
              date
              correlation
              corrPercentile
              roundNumber
              mmc
              mmcPercentile
              fnc
              fncPercentile
              correlationWithMetamodel
            }
          }
        }
    """
    response = napi.raw_query(query, {'username': username})
    entries = response['data']['v2UserProfile']['dailySubmissionPerformances']

    # convert strings to python objects
    for entry in entries:
        utils.replace(entry, "date", utils.parse_datetime_string)

    # drop the entries that carry no score at all
    scored = []
    for entry in entries:
        if any([entry['correlation'], entry['fnc'], entry['mmc']]):
            scored.append(entry)
    return scored
177
+
178
+
179
def daily_submissions_performances_V3(modelname: str) -> List[Dict]:
    """Fetch the per-round performance entries of a model (v3 profile).

    Args:
        modelname (str): value sent as the ``modelName`` argument of the
            ``v3UserProfile`` query.
    Returns:
        list of dicts: the ``roundModelPerformances`` entries for which at
        least one of ``corr``, ``tc`` or ``mmc`` is truthy. Each entry holds
        the fields listed in the query below; ``roundResolveTime`` is run
        through ``utils.parse_datetime_string``.
    """
    # Each field is requested once; the earlier version listed
    # roundResolveTime and mmcMultiplier twice.
    query = """
        query($modelName: String!) {
          v3UserProfile(modelName: $modelName) {
            roundModelPerformances{
              roundNumber
              roundResolveTime
              corr
              corrPercentile
              mmc
              mmcMultiplier
              mmcPercentile
              tc
              tcPercentile
              tcMultiplier
              fncV3
              fncV3Percentile
              corrWMetamodel
              payout
              roundResolved
              corrMultiplier
              selectedStakeValue
            }
            stakeValue
            nmrStaked
          }
        }
    """
    arguments = {'modelName': modelname}
    data = napi.raw_query(query, arguments)['data']['v3UserProfile']
    performances = data['roundModelPerformances']
    # Convert strings to python objects. The only timestamp this query
    # returns is roundResolveTime; the previous code targeted a "date" key
    # that is never requested, so nothing was ever converted.
    for perf in performances:
        utils.replace(perf, "roundResolveTime", utils.parse_datetime_string)
    # remove entries that carry no score at all
    return [p for p in performances
            if any([p['corr'], p['tc'], p['mmc']])]
219
+
220
+
221
def get_lb_models(limit=20000, offset=0):
    """List the usernames returned by the ``v2Leaderboard`` query.

    Args:
        limit (int, optional): forwarded as the query's ``limit`` argument,
            defaults to 20000.
        offset (int, optional): forwarded as the query's ``offset`` argument,
            defaults to 0.
    Returns:
        list of str: the ``username`` of every leaderboard entry, in the
        order the API returned them.
    """
    query = """
        query($limit: Int, $offset: Int){
          v2Leaderboard(limit:$limit, offset:$offset){
            username
          }
        }
    """
    response = napi.raw_query(query, {'limit': limit, 'offset': offset})
    leaderboard = response['data']['v2Leaderboard']
    usernames = []
    for entry in leaderboard:
        usernames.append(entry['username'])
    return usernames
233
+
234
+
235
+
236
def get_round_model_performance(roundNumber: int, model: str):
    """Summarise one model's performance in one round as a flat dict.

    Args:
        roundNumber (int): round to query.
        model (str): sent as the ``username`` argument of the query.
    Returns:
        dict with ``model``, ``roundNumber``, ``corrMultiplier``,
        ``mmcMultiplier`` and ``selectedStakeValue``, followed by every key
        of the last element of ``roundDailyPerformances`` (``correlation``,
        ``mmc``, ``corrPercentile``, ``mmcPercentile``, ``payoutPending``).
    """
    query = """
        query($roundNumber: Int!, $username: String!) {
          roundSubmissionPerformance(roundNumber: $roundNumber, username: $username) {
            corrMultiplier
            mmcMultiplier
            roundDailyPerformances{
              correlation
              mmc
              corrPercentile
              mmcPercentile
              payoutPending
            }
            selectedStakeValue
          }
        }
    """
    variables = {'roundNumber': roundNumber, 'username': model}
    payload = napi.raw_query(query, variables)['data']['roundSubmissionPerformance']
    # NOTE(review): the last element is treated as the latest day; the original
    # author flagged a possible issue with the ordering of this list.
    latest_day = payload['roundDailyPerformances'][-1]

    summary = {
        'model': model,
        'roundNumber': roundNumber,
        'corrMultiplier': payload['corrMultiplier'],
        'mmcMultiplier': payload['mmcMultiplier'],
        'selectedStakeValue': payload['selectedStakeValue'],
    }
    summary.update(latest_day)
    return summary
265
+
266
+
267
+
268
+
269
def get_user_profile(username: str) -> List[Dict]:
    """Fetch the raw ``v2UserProfile`` payload of a user.

    Unlike ``daily_submissions_performances`` this function does no
    post-processing: dates are not parsed and no entries are filtered out.

    Args:
        username (str): value sent as the ``username`` argument of the query.
    Returns:
        the ``'data'`` member of the raw query response. The query requests
        ``v2UserProfile.dailySubmissionPerformances`` with the fields date,
        correlation, corrPercentile, roundNumber, mmc, mmcPercentile, fnc,
        fncPercentile and correlationWithMetamodel.
        NOTE(review): the ``List[Dict]`` annotation looks inherited from
        ``daily_submissions_performances``; the value returned here is the
        un-indexed ``'data'`` payload - confirm and adjust the annotation.
    """
    query = """
        query($username: String!) {
          v2UserProfile(username: $username) {
            dailySubmissionPerformances {
              date
              correlation
              corrPercentile
              roundNumber
              mmc
              mmcPercentile
              fnc
              fncPercentile
              correlationWithMetamodel
            }
          }
        }
    """
    response = napi.raw_query(query, {'username': username})
    return response['data']
322
+
323
+
324
def download_dataset(filename: str, dest_path: str = None,
                     round_num: int = None) -> None:
    """Download a dataset file through the ``dataset`` query.

    Args:
        filename (str): file to be downloaded
        dest_path (str, optional): complete path where the file should be
            stored, defaults to the same name as the source file
        round_num (int, optional): sent as the ``round`` argument of the
            query; None is sent when omitted

    Example:
        >>> download_dataset("some_file.parquet", "data/some_file.parquet")
    """
    target = filename if dest_path is None else dest_path

    query = """
        query ($filename: String!
               $round: Int) {
            dataset(filename: $filename
                    round: $round)
        }
    """
    variables = {'filename': filename, "round": round_num}

    # the query resolves to the URL the file is then fetched from
    url = napi.raw_query(query, variables)['data']['dataset']
    utils.download_file(url, target, show_progress_bars=True)
354
+
355
+
356
+
357
+ # function using V3UserProfile
358
+
359
def model_payout_history(model):
    """Return the per-round payout records of a model as a dataframe.

    Args:
        model (str): value sent as the ``modelName`` argument of the
            ``v3UserProfile`` query.
    Returns:
        pandas.DataFrame built from ``roundModelPerformances`` (columns
        payout, roundNumber, roundResolved, roundResolveTime, corrMultiplier,
        mmcMultiplier, selectedStakeValue), restricted to the rows whose
        ``payout`` is not null and re-indexed from 0.
    """
    # a dedicated client is created for every call (it shadows the module one)
    client = numerapi.NumerAPI()
    query = """
        query($model: String!) {
          v3UserProfile(modelName: $model) {
            roundModelPerformances{
              payout
              roundNumber
              roundResolved
              roundResolveTime
              corrMultiplier
              mmcMultiplier
              selectedStakeValue
            }
            stakeValue
            nmrStaked
          }
        }
    """
    response = client.raw_query(query, {'model': model})
    round_records = response['data']['v3UserProfile']['roundModelPerformances']
    frame = pd.DataFrame.from_dict(round_records)
    has_payout = ~pd.isnull(frame['payout'])
    return frame[has_payout].reset_index(drop=True)
383
+
384
+
385
def get_model_history_v3(model):
    """Build the stake / profit-and-loss history of a model.

    Args:
        model (str): model name, forwarded to ``model_payout_history``.
    Returns:
        pandas.DataFrame with the columns model, date, current_stake,
        floating_stake, payout, floating_pl, realised_pl, roundResolved and
        roundNumber:
            * current_stake: ``selectedStakeValue`` as float
            * realised_pl: ``project_utils.series_reverse_cumsum`` of the
              payouts; unresolved rounds all carry the value found on the
              first resolved row (0.0 when no round is resolved yet)
            * floating_pl: reverse cumulative sum of the payouts of the
              unresolved rounds, 0.0 on resolved rounds
            * floating_stake: current_stake + floating_pl
        NOTE(review): taking the *first* resolved row as the latest one
        presumes the rows arrive newest-first - confirm against the API.
    """
    res = model_payout_history(model)
    # defensive conversion kept from the original implementation
    res = pd.DataFrame.from_dict(res)
    res['payout'] = res['payout'].astype(np.float64)
    res['current_stake'] = res['selectedStakeValue'].astype(np.float64)
    res['payout_cumsum'] = project_utils.series_reverse_cumsum(res['payout'])
    res['date'] = pd.to_datetime(res['roundResolveTime']).dt.date

    # Explicit comparisons (not ~resolved) so that rows whose flag is neither
    # True nor False stay in neither group, exactly as before.
    resolved = res['roundResolved'] == True  # noqa: E712
    pending = res['roundResolved'] == False  # noqa: E712

    res['realised_pl'] = res['payout_cumsum']
    resolved_cumsum = res.loc[resolved, 'payout_cumsum'].values
    # A model without any resolved round has realised nothing yet; the
    # previous unconditional .values[0] raised IndexError in that case.
    latest_realised_pl = resolved_cumsum[0] if len(resolved_cumsum) else 0.0
    res.loc[pending, 'realised_pl'] = latest_realised_pl

    # start as float so the float cumsum below does not change the dtype
    res['floating_pl'] = 0.0
    pending_payouts = res.loc[pending, 'payout'].values
    res.loc[pending, 'floating_pl'] = pending_payouts[::-1].cumsum()[::-1]

    res['model'] = model
    res['floating_stake'] = res['current_stake'] + res['floating_pl']
    cols = ['model', 'date', 'current_stake', 'floating_stake', 'payout',
            'floating_pl', 'realised_pl', 'roundResolved', 'roundNumber']
    return res[cols]
409
+
410
+
411
+
412
+
413
+
414
+
project_tools/project_config.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
# Make the parent of the current working directory importable.
sys.path.append(os.path.dirname(os.getcwd()))

# strftime / strptime patterns
DATETIME_FORMAT1 = '%Y%m%d%H%M'
DATETIME_FORMAT2 = '%Y/%m/%d %H:%M'
DATETIME_FORMAT3 = '%Y-%m-%d'
SAVE_LOCAL_COPY = True

BENCHMARK_MODELS = ['integration_test', 'integration_test_7'] #'budbot_7'] #'integration_test_7'
# Pickle files, given relative to the current working directory.
MODEL_ROUND_RESULT_FILE = '../feature_data/model_round_result.pkl'
MODEL_DAILY_RESULT_FILE = '../feature_data/model_daily_result.pkl'

NUMERATI_URL = 'https://raw.githubusercontent.com/woobe/numerati/master/data.csv'
NUMERATI_FILE = '../feature_data/numerati_data.pkl'
FEATURE_PATH = '../feature_data/'


# Named groups of model names.
# to be discarded
MODEL_NAMES = ['yxbot', 'yxbot2', 'sforest_baihu', 'stree_qinlong', 'flyingbus_mcv6', 'starry_night','fish_and_chips', 'rogue_planet', 'three_body_problem', 'grinning_cat', 'schrodingers_cat', 'omega_weapon', 'ifirit','dark_bahamut', 'wen_score', 'qinlong', 'baihu','marlboro', 'hell_cerberus', 'fuxi', 'roci_fuxi', 'kupo_mcv7', 'yxbot_mcv2', 'yxbot_mcv10']


NEW_MODEL_NAMES = ['yxbot3_m15', 'yxbot4_m23', 'yxbot5', 'yxbot6_m16', 'yxbot7_m17', 'yxbot_a10b8', 'yxbot9_m24', 'yxbot_a10', 'yxbot_a10xu', 'yxbot_a10bk','yxbot_a11', 'yxbot_a12', 'yxbot_ultima_weapon', 'yxbot_valkyrie', 'yxbot_bearmate', 'yxbot_dracula','yxbot_a13', 'yxbot_a14', 'yxbot15_zhuque', 'yxbot_redhare', 'yxbot_a15', 'yxbot18_m25', 'yxbot11_x302']

# flyingbus

# presumably a hand-picked set of leaderboard models - TODO confirm
TOP_LB = ['mdl3', 'nescience', 'sapphirescipionyx','quantaquetzalcoatlus', 'anna13', 'mercuryai', 'uuazed6', 'rosetta', 'sinookas']


# NOTE(review): TP3M / TP1Y look like "top performers over 3 months / 1 year" - confirm
TP3M = ['ageonsen', 'davebaty', 'wallingford_nut', 'filipstefano2', 'davat6', 'lions', 'wsw', 'lottery_of_babylon', 'kup_choy_n', 'pinky_and_the_brain']


TP1Y = ['hiryuu', 'victoria', 'benben11', 'usigma7', 'crystal_sphere', 'era__mix__2000', 'rgb_alpha', 'smokh', 'shoukaku', 'stables', 'deepnum', 'botarai', 'zuikaku', 'kond']


ARBITRAGE_MODELS = ['arbitrage', 'arbitrage2', 'arbitrage3', 'arbitrage4', 'leverage', 'leverage2', 'leverage3', 'culebracapital', 'culebracapital2', 'culebracapital3']


IAAI_MODELS = ['ia_ai', 'the_aijoe4','i_like_the_coin_08', 'i_like_the_coin_09', 'i_like_the_coin_10']


RESTRADE_MODELS = ['restrading', 'restrading2', 'restrading3', 'restrading4', 'restrading5', 'restrading6', 'restrading7', 'restrading8', 'restrading9']

MCV_MODELS = ['mcv', 'mcv2', 'mcv3', 'mcv4', 'mcv5','mcv6','mcv7','mcv8','mcv9','mcv10','mcv11','mcv12','mcv13']
MCV_NEW_MODELS = ['mcv14', 'mcv15', 'mcv16', 'mcv17', 'mcv18', 'mcv19', 'mcv20', 'mcv21', 'mcv22', 'mcv23', 'mcv24', 'mcv25', 'mcv26', 'mcv27', 'mcv28', 'mcv29', 'mcv30', 'mcv31', 'mcv32', 'mcv33', 'mcv34', 'mcv35', 'mcv36', 'mcv37', 'mcv38', 'mcv39', 'mcv40', 'mcv41', 'mcv42', 'mcv43', 'mcv44', 'mcv45', 'mcv46', 'mcv47', 'mcv48', 'mcv49', 'mcv50']
47
+
project_tools/project_utils.py ADDED
@@ -0,0 +1,813 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ import pickle
5
+ import time
6
+ from contextlib import contextmanager
7
+ from importlib import reload
8
+ import re
9
+ from project_tools import project_config, project_utils, numerapi_utils
10
+ import glob
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ from random import randint, random
14
+ import itertools
15
+ import scipy
16
+ from scipy.stats import ks_2samp
17
+ from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, mean_squared_error
18
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler
19
+ from sklearn.pipeline import make_pipeline
20
+ from sklearn import linear_model
21
+ import datetime
22
+ import json
23
+ from collections import OrderedDict
24
+ from os import listdir
25
+ from os.path import isfile, join, isdir
26
+ import glob
27
+ import numerapi
28
+ import itertools
29
+ import io
30
+ import requests
31
+ from pathlib import Path
32
+ from scipy.stats.mstats import gmean
33
+ from typing import List, Dict
34
+
35
+
36
# Module-level NumerAPI client created at import time; the verbosity="info"
# argument is left disabled in the trailing comment.
napi = numerapi.NumerAPI() #verbosity="info")
37
+
38
+
39
def get_time_string():
    """
    Build a compact timestamp for the moment this function is called.
    :return: the local time formatted as 'YYYYmmddHHMM' (12 digits, minute resolution)
    """
    return datetime.datetime.now().strftime('%Y%m%d%H%M')
48
+
49
+
50
def reload_project():
    """
    Re-import the project modules (project_config, project_utils, numerapi_utils, in that order)
    so that edits made during an interactive session take effect without restarting it.
    :return: None
    """
    for module in (project_config, project_utils, numerapi_utils):
        reload(module)
58
+
59
@contextmanager
def timer(name):
    """
    Context manager that prints how long the enclosed block took to run.
    :param name: label of the code fragment being timed, shown in both messages
    :yield: nothing; '[name] in progress' is printed on entry and '[name] done in N s' on exit
    """
    t0 = time.time()
    print('[%s] in progress' % name)
    try:
        yield
    finally:
        # report the elapsed time even when the timed block raises; the
        # exception itself still propagates to the caller
        print('[%s] done in %.6f s' %(name, time.time() - t0))
70
+
71
+
72
+
73
def load_data(pickle_file):
    """
    load pickle data from file
    :param pickle_file: path of pickle data
    :return: data stored in pickle file
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The with-block closes the handle, which the previous version never did.
    with open(pickle_file, 'rb') as load_file:
        return pickle.load(load_file)
82
+
83
+
84
def pickle_data(path, data, protocol=-1, timestamp=False, verbose=True):
    """
    Pickle data to specified file
    :param path: full path of file where data will be pickled to
    :param data: data to be pickled
    :param protocol: pickle protocol, -1 indicate to use the latest protocol
    :param timestamp: if true, '_' plus the output of get_time_string() is inserted before the file extension
    :param verbose: if true, print information about file creation
    :return: None
    """
    file = path
    if timestamp:
        base_file, ext = os.path.splitext(file)
        file = base_file + '_' + get_time_string() + ext

    if verbose:
        print('creating file %s' % file)

    # the with-block closes the handle even when pickling fails part-way
    with open(file, 'wb') as save_file:
        pickle.dump(data, save_file, protocol=protocol)
105
+
106
+
107
def save_json(path, data, timestamp=False, verbose=True, indent=2):
    """
    Save data to Json format
    :param path: full path of file where data will be written to
    :param data: data to be serialised
    :param timestamp: if true, the timestamp will be saved as part of the file name
    :param verbose: if true, print information about file creation
    :param indent: specify the width of the indent in the resulted Json file
    :return: None
    """
    file = path
    if timestamp:
        base_file, ext = os.path.splitext(file)
        file = base_file + '_' + get_time_string() + ext
    if verbose:
        print('creating file %s' % file)
    # the with-block closes the handle even when serialisation fails
    with open(file, 'w') as outfile:
        json.dump(data, outfile, indent=indent)
128
+
129
+
130
def load_json(json_file):
    """
    load data from Json file
    :param json_file: path of json file
    :return: the parsed content of the json file (whatever json.load produces for it)
    """
    # the with-block closes the handle even when the file is not valid JSON
    with open(json_file) as load_file:
        return json.load(load_file)
140
+
141
+
142
def create_folder(path):
    """
    Create a directory together with any missing parent directories.
    :param path: path of the directory to create; an already existing directory is left untouched
    :return: None
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
144
+
145
+
146
+
147
def glob_folder_filelist(path, file_type='', recursive=True):
    """
    List the files found under a directory using glob.
    :param path: the path of the directory; a trailing '/' is appended when missing
    :param file_type: if not '', only files whose extension (without the dot) equals this value are kept
    :param recursive: if True, sub-directories are searched as well ('**' pattern)
    :return abs_files: a list with the path of each file as produced by glob (i.e. prefixed with `path`)
    :return base_files: a list with the base name of each file, in the same order
    """
    folder = path if path[-1] == '/' else path + '/'
    wildcard = '**' if recursive else '*'

    full_paths = []
    names = []
    for candidate in glob.glob(folder + wildcard, recursive=recursive):
        # directories are matched by the pattern too; only regular files count
        if not os.path.isfile(candidate):
            continue
        if len(file_type) > 0 and os.path.splitext(candidate)[1][1:] != file_type:
            continue
        full_paths.append(candidate)
        names.append(os.path.basename(candidate))
    return full_paths, names
174
+
175
+
176
def dir_compare(pathl, pathr):
    """
    Compare the file names found directly inside two directories (sub-directories are ignored).
    :param pathl: path of the left directory
    :param pathr: path of the right directory
    :return: two lists - names present only in the left directory, names present only in the right directory
    """
    left_names = {name for name in listdir(pathl) if isfile(join(pathl, name))}
    right_names = {name for name in listdir(pathr) if isfile(join(pathr, name))}
    only_left = list(left_names - right_names)
    only_right = list(right_names - left_names)
    return only_left, only_right
180
+
181
+
182
+
183
+
184
def lr_dir_sync(pathl, pathr):
    """
    One-way sync: copy every file that exists in the left directory but not in the right one.
    Files that exist only on the right are left alone.
    :param pathl: path of the source (left) directory
    :param pathr: path of the destination (right) directory
    :return: None
    """
    # copyfile is not among the module-level imports visible in this file
    from shutil import copyfile

    missing_on_right, _ = dir_compare(pathl, pathr)
    for name in missing_on_right:
        # os.path.join works with or without a trailing separator on the
        # directory arguments; plain concatenation required one
        scr = os.path.join(pathl, name)
        dst = os.path.join(pathr, name)
        print('copying file %s' % scr)
        copyfile(scr, dst)
191
+
192
+
193
+
194
def copy_file_with_time(src_file, dst_file_name, des_path):
    """
    Copy a file to a destination whose name carries the current timestamp.
    The destination is '<des_path><base>_<timestamp><ext>', where base and ext come from
    dst_file_name and the timestamp from get_time_string().
    :param src_file: path of the file to copy
    :param dst_file_name: file name (or path) providing the base name and extension of the copy
    :param des_path: prefix put in front of the new file name; it is concatenated as-is, so a
                     directory must be given with its trailing separator
    :return: None
    """
    # copyfile is not among the module-level imports visible in this file
    from shutil import copyfile

    basename, ext_name = os.path.splitext(os.path.basename(dst_file_name))
    timestr = get_time_string()
    des_name = '%s%s_%s%s' % (des_path, basename, timestr, ext_name)
    copyfile(src_file, des_name)
201
+
202
+
203
+
204
+
205
+
206
def find_filesfromfolder(target_dir, containtext):
    """
    Find the files under a directory whose base name contains a given text.
    The listing comes from glob_folder_filelist called with its default arguments.
    :param target_dir: directory to search
    :param containtext: text that must appear in the base name of a file
    :return: list with the path of every matching file
    """
    paths, names = glob_folder_filelist(target_dir)
    return [path for path, name in zip(paths, names) if containtext in name]
214
+
215
+
216
def cp_files_with_prefix(src_path, dst_path, prefix, ext):
    """
    Copy the files of a given type from one folder to another, adding a prefix to each name.
    :param src_path: folder the files are read from
    :param dst_path: prefix of the destination; it is concatenated as-is with the new file
                     name, so a directory must be given with its trailing separator
    :param prefix: text put in front of every copied file's base name
    :param ext: file type handed to the folder listing as `file_type`
    :return: None
    """
    # copyfile is not among the module-level imports visible in this file
    from shutil import copyfile

    # NOTE(review): get_folder_filelist is not defined in the visible part of
    # this file (the listing helper visible here is glob_folder_filelist) -
    # confirm it exists further down, otherwise this call raises NameError.
    abs_file_list, base_file_list = get_folder_filelist(src_path, file_type=ext)
    for src_file, base_file in zip(abs_file_list, base_file_list):
        dst_file = dst_path + prefix + base_file
        copyfile(src_file, dst_file)
    return None
223
+
224
+
225
+
226
def mv_files_with_prefix(src_path, dst_path, prefix, ext):
    """
    Move the files of a given type from one folder to another, adding a prefix to each name.
    :param src_path: folder the files are taken from
    :param dst_path: prefix of the destination; it is concatenated as-is with the new file
                     name, so a directory must be given with its trailing separator
    :param prefix: text put in front of every moved file's base name
    :param ext: file type handed to the folder listing as `file_type`
    :return: None
    """
    # move is not among the module-level imports visible in this file
    from shutil import move

    # NOTE(review): get_folder_filelist is not defined in the visible part of
    # this file (the listing helper visible here is glob_folder_filelist) -
    # confirm it exists further down, otherwise this call raises NameError.
    abs_file_list, base_file_list = get_folder_filelist(src_path, file_type=ext)
    for src_file, base_file in zip(abs_file_list, base_file_list):
        dst_file = dst_path + prefix + base_file
        move(src_file, dst_file)
    return None
233
+
234
+
235
+
236
def empty_folder(path):
    """
    Delete the files matched inside a folder.
    :param path: either a directory (with or without trailing separator) whose direct entries
                 are removed, or a glob pattern ending in '*' that is used as given.
                 Any other value is treated as a prefix and '*' is appended to it.
    :return: None
    Entries are removed with os.remove, so a matched sub-directory still raises.
    """
    if path[-1] != '*':
        if os.path.isdir(path):
            # 'folder' must become 'folder/*'; appending '*' directly would
            # yield 'folder*' and match sibling entries sharing the prefix
            path = os.path.join(path, '*')
        else:
            path = path + '*'
    for entry in glob.glob(path):
        os.remove(entry)
242
+
243
+
244
def rescale(n, range1, range2):
    """
    Map a value linearly from one range onto another, clamping it to the source range first.
    :param n: the value to convert
    :param range1: (low, high) bounds of the source range
    :param range2: (low, high) bounds of the target range
    :return: the position of the clamped value expressed in the target range
    """
    src_low, src_high = range1[0], range1[1]
    dst_low, dst_high = range2[0], range2[1]
    # cap at the upper bound first, then at the lower bound
    clamped = max(min(n, src_high), src_low)
    src_span = src_high - src_low
    dst_span = dst_high - dst_low
    return (dst_span * (clamped - src_low) / src_span) + dst_low
252
+
253
+
254
+
255
def rmse(y_true, y_pred):
    """
    Root mean square error between labels and predictions.
    :param y_true: label values
    :param y_pred: prediction values
    :return: the square root of sklearn's mean_squared_error for the two inputs
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
263
+
264
+
265
+
266
+
267
def str2date(date_str, dateformat='%Y-%m-%d'):
    """
    Parse a string into a datetime value.
    :param date_str: the text to parse
    :param dateformat: the strptime format the text is written in, '%Y-%m-%d' by default
    :return: the datetime.datetime obtained by parsing date_str with dateformat
    """
    return datetime.datetime.strptime(date_str, dateformat)
276
+
277
+
278
def isnotebook():
    """
    Tell whether the code is running inside a Jupyter notebook kernel.
    :return: True only when get_ipython() exists and its class is named 'ZMQInteractiveShell';
             False for a terminal IPython session, any other shell, or plain Python
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython is only defined when running under IPython
        return False
    return shell_name == 'ZMQInteractiveShell'
293
+
294
+
295
+
296
def list_intersection(left, right):
    """
    Return the elements that two lists have in common.
    Both inputs are converted to sets, so duplicates disappear and the order of the result is not defined.
    :param left: the first input list
    :param right: the second input list
    :return: a list holding the intersection of the two inputs
    """
    return list(set(left) & set(right))
306
+
307
+
308
def list_union(left, right):
    """
    Return every distinct element found in either of two lists.
    Both inputs are converted to sets, so duplicates disappear and the order of the result is not defined.
    :param left: the first input list
    :param right: the second input list
    :return: a list holding the union of the two inputs
    """
    return list(set(left) | set(right))
318
+
319
+
320
def list_difference(left, right):
    """
    Return the elements of the first list that do not appear in the second one.
    Both inputs are converted to sets, so duplicates disappear and the order of the result is not defined.
    :param left: the first input list
    :param right: the second input list
    :return: a list holding the set difference left - right
    """
    return list(set(left) - set(right))
330
+
331
+
332
def is_listelements_identical(left, right):
    """
    Check that two lists have the same length and that no element of the first is missing from the second.
    The element check works on sets, so duplicates are not counted: [1, 1] and [1, 2] pass the check.
    :param left: the first input list
    :param right: the second input list
    :return: True when both conditions hold, otherwise False
    """
    same_length = len(left) == len(right)
    missing_from_right = set(left).difference(set(right))
    nothing_missing = len(missing_from_right) == 0
    return same_length and nothing_missing
336
+
337
+
338
+
339
+
340
def np_corr(a, b):
    """
    Correlation between two arrays, computed with pandas Series.corr and its default settings.
    :param a: the first numpy array input
    :param b: the second numpy array input
    :return: the correlation between the two inputs
    """
    first = pd.Series(a)
    second = pd.Series(b)
    return first.corr(second)
348
+
349
+
350
+
351
def list_sort_values(a, ascending=True):
    """
    Sort the values of a list through pandas Series.sort_values.
    :param a: the input list
    :param ascending: True for ascending order, False for descending order
    :return: a new list with the sorted values
    """
    ordered = pd.Series(a).sort_values(ascending=ascending)
    return ordered.tolist()
359
+
360
+
361
def get_rank(data):
    """
    Convert values into percentage ranks using pandas Series.rank(pct=True).
    :param data: the input data in the form of a list or an array
    :return: numpy array with the percentage rank of every input value, in input order
    """
    return pd.Series(data).rank(pct=True).values
369
+
370
+
371
+
372
def plot_feature_corr(df, features, figsize=(10,10), vmin=-1.0):
    """
    Draw a heatmap of the pair-wise correlations between selected columns of a dataframe.
    :param df: the input dataframe
    :param features: the list of columns whose correlation matrix is plotted
    :param figsize: the size of the figure created for the plot
    :param vmin: value passed to seaborn.heatmap as `vmin`
    :return: the correlation matrix as a pandas dataframe, with missing correlations replaced by 0
    """
    corr_matrix = df[features].corr().fillna(0)
    # a new figure is opened first; the heatmap is then drawn by seaborn
    plt.subplots(figsize=figsize)
    sns.heatmap(corr_matrix, vmin=vmin, square=True)
    return corr_matrix
385
+
386
+
387
def decision_to_prob(data):
    """
    convert output value of a sklearn classifier (i.e. ridge classifier) decision function into probability
    The conversion is a softmax taken over ALL elements of the input: the exponentials are divided
    by their total sum, so the returned values add up to 1 across the whole array.
    :param data: output value of decision function in the form of a numpy array
    :return: value of probability in the form of a numpy array
    """
    scores = np.asarray(data)
    if scores.size == 0:
        # nothing to normalise; np.max would raise on an empty array
        return np.exp(scores)
    # Subtracting the maximum leaves the ratios unchanged but keeps np.exp
    # from overflowing to inf (and the result from becoming nan) on large scores.
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)
395
+
396
+
397
def np_describe(a):
    """
    Summarise a numpy array with the ``describe`` method of a pandas Series.

    :param a: the input numpy array (flattened before being described)
    :return: overall statistic description
    """
    flat_values = a.flatten()
    return pd.Series(flat_values).describe()
404
+
405
+
406
def ks_2samp_selection(train_df, test_df, pval=0.1):
    """
    Keep only the features whose train and test distributions look alike
    according to scipy's two-sample Kolmogorov-Smirnov test.

    :param train_df: the input train dataframe
    :param test_df: the input test dataframe (must contain every train column)
    :param pval: the p value threshold; features whose p value is below it are dropped
    :return train_df: the train dataframe restricted to the retained features
    :return test_df: the test dataframe restricted to the retained features
    """
    feature_names = train_df.columns.tolist()
    p_values = [ks_2samp(train_df[name], test_df[name])[1] for name in feature_names]
    ranked = pd.Series(p_values, index=feature_names).sort_values()
    rejected = list(ranked[ranked < pval].index)
    return train_df.drop(columns=rejected), test_df.drop(columns=rejected)
423
+
424
+
425
+
426
def df_balance_sampling(df, class_feature, minor_class=1, sample_ratio=1):
    """
    Build a shuffled sample that keeps every minority-class row and a random
    subset of the other class.

    The other class is selected as ``1 - minor_class``, so the labels are
    expected to be 0/1.

    :param df: the input dataframe
    :param class_feature: name of the column holding the class label
    :param minor_class: label of the minority class (0 or 1)
    :param sample_ratio: number of majority rows drawn per minority row
    :return: dataframe of all minority rows plus the sampled majority rows,
             shuffled and with a fresh 0..n-1 index
    """
    minor_df = df[df[class_feature] == minor_class]
    # sample() only accepts an integer row count, so allow fractional ratios.
    n_major = int(sample_ratio * len(minor_df))
    major_df = df[df[class_feature] == (1 - minor_class)].sample(n_major)

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    res_df = pd.concat([minor_df, major_df])
    return res_df.sample(len(res_df)).reset_index(drop=True)
440
+
441
+
442
def prob2acc(label, probs, p=0.5):
    """
    Threshold probability predictions into binary predictions and score them.

    :param label: labels used to evaluate accuracy score
    :param probs: probability predictions (array-like supporting ``>=`` and ``astype``)
    :param p: threshold; probabilities greater than or equal to it become 1, others 0
    :return acc: the accuracy score of the thresholded predictions
    :return preds: the thresholded predictions as uint8 values
    """
    above_threshold = probs >= p
    preds = above_threshold.astype(np.uint8)
    return accuracy_score(label, preds), preds
455
+
456
+
457
+
458
def np_pearson(t, p):
    """
    Compute the Pearson correlation coefficient of two arrays.

    :param t: the first array (must support ``.mean()``)
    :param p: the second array (must support ``.mean()``)
    :return: sum of the products of the centred values divided by the product
             of the square roots of their summed squares
    """
    t_centred = t - t.mean()
    p_centred = p - p.mean()
    numerator = np.sum(t_centred * p_centred)
    t_norm = np.sqrt(np.sum(t_centred ** 2))
    p_norm = np.sqrt(np.sum(p_centred ** 2))
    return numerator / (t_norm * p_norm)
465
+
466
+
467
def df_get_features_with_str(df, ptrn):
    """
    List the column names of a dataframe that match a regular expression.

    :param df: the input dataframe whose column names are inspected
    :param ptrn: the regular expression pattern to look for anywhere in the name
    :return: list of column names containing at least one match, in column order
    """
    matching = []
    for name in df.columns.tolist():
        if re.search(ptrn, name) is not None:
            matching.append(name)
    return matching
475
+
476
+
477
def df_fillna_with_other(df, src_feature, dst_feature):
    """
    Fill the missing values of ``dst_feature`` with the value of
    ``src_feature`` from the same row.

    The input dataframe is modified in place and also returned.

    :param df: the input dataframe
    :param src_feature: the feature whose values are used as replacements
    :param dst_feature: the feature whose NA values will be filled
    :return: the same dataframe, with the NA values of "dst_feature" filled
             from "src_feature"
    """
    # fillna aligns on the index, works for non-numeric columns, and avoids
    # writing into the array returned by .values (read-only under pandas
    # copy-on-write).
    df[dst_feature] = df[dst_feature].fillna(df[src_feature])
    return df
491
+
492
+
493
+
494
def plot_prediction_prob(y_pred_prob):
    """
    Show a 50-bin histogram of probability predictions and print their summary.

    :param y_pred_prob: the probability prediction values to be plotted
    :return: None; the histogram is shown and ``describe()`` output is printed
    """
    prob_series = pd.Series(data=y_pred_prob, name='prediction probability')
    prob_series.plot(kind='hist', figsize=(15, 5), bins=50)
    plt.show()
    summary = prob_series.describe()
    print(summary)
505
+
506
+
507
+
508
+
509
+
510
def df_traintest_split(df, split_var, seed=None, train_ratio=0.75):
    """
    Split a dataframe into train and test parts by the unique values of one
    feature, so that each unique value appears in only one of the two parts.

    :param df: the input dataframe to be split
    :param split_var: the feature whose unique values are distributed between train and test
    :param seed: the random state used when sampling the train values
    :param train_ratio: the share of unique ``split_var`` values assigned to train
                        (rounded down to a whole number of values)
    :return train_df: the resulting train dataframe, with a fresh 0..n-1 index
    :return test_df: the resulting test dataframe, with a fresh 0..n-1 index
    """
    unique_values = pd.Series(df[split_var].unique())
    train_length = int(len(unique_values) * train_ratio)
    train_values = unique_values.sample(train_length, random_state=seed)

    # A boolean mask works for any index; positional lookup with index labels
    # is only correct when the index happens to be 0..n-1.
    in_train = df[split_var].isin(train_values)
    train_df = df.loc[in_train].copy().reset_index(drop=True)
    test_df = df.loc[~in_train].copy().reset_index(drop=True)
    return train_df, test_df
528
+
529
+
530
+
531
# https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def _downcast_column(series):
    """Return `series` cast to a narrower numeric dtype when that is safe, else unchanged."""
    col_type = series.dtype
    # Pandas extension dtypes (category, nullable ints, ...) are left alone.
    if not isinstance(col_type, np.dtype):
        return series

    if col_type.kind == 'i':
        c_min, c_max = series.min(), series.max()
        if pd.isna(c_min) or pd.isna(c_max):
            return series
        for candidate in (np.int8, np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    return series.astype(candidate)
                return series
        return series

    # Only floats wider than 32 bits are narrowed; NaN/inf bounds fail the
    # comparisons below and leave the column untouched.
    if col_type.kind == 'f' and col_type.itemsize > 4:
        c_min, c_max = series.min(), series.max()
        limits = np.finfo(np.float32)
        if limits.min < c_min and c_max < limits.max:
            return series.astype(np.float32)

    # bool, unsigned, datetime, object, ... are never converted.
    return series


def reduce_mem_usage(df, verbose=True, exceiptions=()):
    """
    Iterate through all the columns of a dataframe and narrow the numeric
    data types to reduce memory usage.

    Signed integer columns are cast to the smallest of int8/int16/int32 that
    holds their range; wider float columns are cast to float32 when their
    range fits. All other column types are left unchanged. A dataframe input
    is modified in place.

    :param df: the dataframe (or numpy array) to shrink
    :param verbose: print the name of each column as it is processed
    :param exceiptions: collection of column names that must not be converted
    :return: the converted dataframe, or a numpy array when the input was one
    """
    np_input = isinstance(df, np.ndarray)
    if np_input:
        df = pd.DataFrame(data=df)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col_id, col in enumerate(df.columns):
        if verbose:
            print('doing %d: %s' % (col_id, col))
        if col in exceiptions:
            continue
        try:
            column = df[col]
            converted = _downcast_column(column)
            if converted is not column:
                df[col] = converted
        except (AttributeError, TypeError, ValueError, OverflowError) as err:
            # Best effort: a column that cannot be inspected or converted
            # (e.g. duplicated column names) is skipped rather than failing.
            if verbose:
                print('skipping %s: %s' % (col, err))
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    if np_input:
        return df.values
    return df
581
+
582
+
583
+
584
def get_xgb_featimp(model):
    """
    Collect the feature importance scores of an xgboost model into a dataframe.

    :param model: an object exposing ``get_booster()`` (whose result is used),
                  or an object that itself provides ``feature_names`` and
                  ``get_score(importance_type=...)``
    :return: dataframe indexed by feature name with one column per importance
             type; features absent from a score dictionary get NaN
    """
    imp_type = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
    # Use the wrapped booster when the model offers one, without hiding
    # unrelated errors behind a bare except.
    get_booster = getattr(model, 'get_booster', None)
    bst = get_booster() if callable(get_booster) else model

    feature_names = bst.feature_names
    imp_dict = {}
    for impt in imp_type:
        scores = bst.get_score(importance_type=impt)
        imp_dict[impt] = [scores.get(feature, np.nan) for feature in feature_names]
    return pd.DataFrame(index=feature_names, data=imp_dict)
602
+
603
+
604
def get_df_rankavg(df):
    """
    Rank every column as a percentage and average the ranks per row.

    :param df: the input dataframe
    :return: dataframe of per-column percentage ranks plus a 'rankavg' column
             holding their row-wise mean, sorted by 'rankavg' in descending order
    """
    column_names = df.columns.tolist()
    pct_ranks = {name: df[name].rank(pct=True).tolist() for name in column_names}
    ranked = pd.DataFrame(data=pct_ranks, index=df.index, columns=column_names)
    ranked['rankavg'] = ranked.mean(axis=1)
    return ranked.sort_values(by='rankavg', ascending=False)
613
+
614
+
615
def get_list_gmean(lists):
    """
    Compute the element-wise geometric mean across several equal-length lists.

    :param lists: sequence of lists; each becomes one column of a matrix
    :return: array whose i-th entry is the geometric mean of the i-th
             elements of all the lists
    """
    n_rows = len(lists[0])
    stacked = np.zeros((n_rows, len(lists)))
    for position, values in enumerate(lists):
        stacked[:, position] = values
    return gmean(stacked, axis=1)
621
+
622
+
623
+
624
def generate_nwise_combination(items, n=2):
    """
    List every combination of ``n`` elements taken from ``items``.

    :param items: the iterable to draw elements from
    :param n: the number of elements per combination
    :return: list of tuples, in the order produced by ``itertools.combinations``
    """
    return [combo for combo in itertools.combinations(items, n)]
626
+
627
+
628
def pairwise_feature_generation(df, feature_list, operator='addition', verbose=True):
    """
    Combine every pair of the listed features with one arithmetic operator.

    :param df: the input dataframe
    :param feature_list: names of the features to pair up
    :param operator: 'addition', 'multiplication' or 'division'; any other
                     value produces no columns
    :param verbose: print a line for each pair being processed
    :return: dataframe with one column per pair, named
             '<a>_add_<b>', '<a>_mulp_<b>' or '<a>_div_<b>'
    """
    recipes = {
        'addition': ('_add_', lambda left, right: left + right),
        'multiplication': ('_mulp_', lambda left, right: left * right),
        'division': ('_div_', lambda left, right: left / right),
    }
    recipe = recipes.get(operator)

    result_df = pd.DataFrame()
    for first, second in generate_nwise_combination(feature_list, 2):
        if verbose:
            print('generating %s of %s and %s' % (operator, first, second))
        if recipe is None:
            continue
        infix, combine = recipe
        feat_name = first + infix + second
        result_df[feat_name] = combine(df[first], df[second])
    return result_df
644
+
645
+
646
def try_divide(x, y, val=0.0):
    """
    Divide two numbers, falling back to a default when the divisor is zero.

    :param x: the number to be used as dividend
    :param y: the number to be used as divisor
    :param val: the value returned instead of dividing when ``y`` equals zero
    :return: ``float(x) / y``, or ``val`` if ``y`` is zero
    """
    return float(x) / y if y != 0.0 else val
657
+
658
+
659
def series_reverse_cumsum(a):
    """
    Cumulative sum of a series accumulated from the last element backwards.

    :param a: the input pandas series; missing values are treated as 0
    :return: array whose i-th entry is the sum of elements i..end
    """
    reversed_values = a.fillna(0).values[::-1]
    running_total = reversed_values.cumsum()
    return running_total[::-1]
661
+
662
+
663
def get_array_sharpe(values):
    """
    Ratio of the mean to the standard deviation of the input.

    :param values: object providing ``mean()`` and ``std()`` (numpy array or
                   pandas series; each uses its own default for ``std``)
    :return: ``values.mean() / values.std()``
    """
    average = values.mean()
    spread = values.std()
    return average / spread
665
+
666
+
667
+ #### NumerDash specific functions ###
668
+
669
def calculate_rounddailysharpe_dashboard(df, lastround, earliest_round, score='corr'):
    """
    Build a per-model table of round-level sharpe values for one score column.

    For every (model, roundNumber, group) the sharpe of ``score`` is computed
    with ``get_array_sharpe``; rounds are then spread into columns (latest
    round first) and two summary columns are added.

    :param df: dataframe with 'model', 'roundNumber', 'group' and the ``score`` column
    :param lastround: the latest round number to include (inclusive)
    :param earliest_round: the earliest round number to include (inclusive)
    :param score: name of the score column to evaluate, e.g. 'corr' or 'mmc'
    :return: dataframe with columns 'model', 'group', 'sos' (sharpe of the
             round sharpes), 'avg_sharpe' (mean of the round sharpes) and one
             column per round, sorted by 'sos' in descending order
    """
    # The intermediate column name never reaches the output (the pivot spreads
    # it over round columns), so it is derived from `score` directly; any
    # score column is supported instead of a fixed list.
    target = '{}_sharpe'.format(score)
    mean_feat = 'avg_sharpe'
    sos_feat = 'sos'

    in_range = (df['roundNumber'] >= earliest_round) & (df['roundNumber'] <= lastround)
    res = df[in_range].groupby(['model', 'roundNumber', 'group'])[score].apply(
        get_array_sharpe).reset_index(drop=False)
    res = res.rename(columns={score: target}).sort_values('roundNumber', ascending=False)
    res = res.pivot(index=['model', 'group'], columns='roundNumber', values=target)
    res.columns.name = ''

    # Reverse the round columns so the most recent round comes first.
    cols = list(res.columns[::-1])
    # Copy so the summary columns are added to an independent frame.
    res = res[cols].copy()
    res[mean_feat] = res[cols].mean(axis=1)
    res[sos_feat] = res[cols].apply(get_array_sharpe, axis=1)
    # NOTE(review): drop_duplicates removes rows whose values are identical
    # across all columns, whatever the model - confirm this is intended.
    res = res.drop_duplicates(keep='first').sort_values(by=sos_feat, ascending=False)
    res.reset_index(drop=False, inplace=True)
    return res[['model', 'group', sos_feat, mean_feat] + cols]
702
+
703
+
704
+
705
def groupby_agg_execution(agg_recipies, df, verbose=True):
    """
    Run a list of group-by aggregation recipes against a dataframe.

    :param agg_recipies: iterable of (groupby_cols, features, aggs) tuples;
                         each agg is either an aggregation accepted by
                         ``agg`` or a one-entry dict {name: aggregation}
    :param df: the input dataframe
    :param verbose: print the name of each statistic as it is generated
    :return: dict keyed by the '_'-joined group-by columns; each value is a
             dataframe with the group-by columns and one column per statistic,
             named '<key>_<feature>_<agg>' ('<key>_count' for 'count')
    """
    result_dfs = {}
    for groupby_cols, features, aggs in agg_recipies:
        grouped = df.groupby(groupby_cols)
        groupby_key = '_'.join(groupby_cols)
        if groupby_key not in result_dfs:
            result_dfs[groupby_key] = pd.DataFrame()

        for feature in features:
            for agg in aggs:
                if isinstance(agg, dict):
                    agg_name = list(agg)[0]
                    agg_func = agg[agg_name]
                else:
                    agg_name = agg_func = agg

                if agg_name == 'count':
                    stat_name = '{}_{}'.format(groupby_key, agg_name)
                else:
                    stat_name = '{}_{}_{}'.format(groupby_key, feature, agg_name)
                if verbose:
                    print(f'generating statistic {stat_name}')

                stat_df = grouped[feature].agg(agg_func).reset_index(drop=False)
                stat_df = stat_df.rename(columns={feature: stat_name})
                if len(result_dfs[groupby_key]) == 0:
                    # First statistic for this key also supplies the group columns.
                    result_dfs[groupby_key] = stat_df
                else:
                    result_dfs[groupby_key][stat_name] = stat_df[stat_name]
    return result_dfs
733
+
734
+
735
def get_latest_round_id():
    """
    Return the number of the latest round.

    The first entry returned by ``numerapi_utils.get_competitions()`` is used;
    if that lookup fails for any reason, the largest 'roundNumber' found in
    the locally stored dashboard result file is used instead.

    :return: the latest round number as an int
    """
    try:
        all_competitions = numerapi_utils.get_competitions()
        latest_comp_id = all_competitions[0]['number']
    except Exception:
        # Deliberate best-effort fallback; `except Exception` (rather than a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        print('calling api unsuccessulf, using downloaded data to get the latest round')
        local_data = load_data(project_config.DASHBOARD_MODEL_RESULT_FILE)
        latest_comp_id = local_data['roundNumber'].max()
    return int(latest_comp_id)
744
+ # except:
745
+
746
# Module-level value computed when this module is first executed: this runs
# get_latest_round_id(), i.e. the competitions lookup or its local-file fallback.
latest_round = get_latest_round_id()
747
+
748
+
749
+
750
+
751
def update_numerati_data(url=project_config.NUMERATI_URL, save_path=project_config.FEATURE_PATH):
    """
    Download the numerati CSV, store it as a pickle and return it.

    :param url: address of the CSV file to download
    :param save_path: directory in which 'numerati_data.pkl' is written
    :return: the downloaded data as a pandas dataframe
    :raises requests.RequestException: if the request fails, times out or
            returns an HTTP error status
    """
    # A timeout keeps the caller from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of parsing and pickling an HTTP error page as data.
    response.raise_for_status()
    data = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
    save_file = os.path.join(save_path, 'numerati_data.pkl')
    pickle_data(save_file, data)
    return data
757
+
758
+
759
+
760
+
761
def get_model_group(model_name):
    """
    Map a model name to the name of the group it belongs to.

    Groups are checked in a fixed order against the model lists defined in
    ``project_config``; the first list containing the name wins.

    :param model_name: the model name to classify
    :return: the group name, or 'other' when the name is in none of the lists
    """
    # (project_config attribute names whose lists are concatenated, group name)
    group_rules = (
        (('MODEL_NAMES', 'NEW_MODEL_NAMES'), 'yx'),
        (('TOP_LB',), 'top_corr'),
        (('IAAI_MODELS',), 'iaai'),
        (('ARBITRAGE_MODELS',), 'arbitrage'),
        (('MCV_MODELS',), 'mcv'),
        (('BENCHMARK_MODELS',), 'benchmark'),
        (('TP3M',), 'top_3m'),
        (('TP1Y',), 'top_1y'),
    )
    for attr_names, group_name in group_rules:
        # Attributes are looked up lazily, only as far as the first match.
        members = getattr(project_config, attr_names[0])
        for extra_name in attr_names[1:]:
            members = members + getattr(project_config, extra_name)
        if model_name in members:
            return group_name
    return 'other'
782
+
783
+
784
def _config_file_ctime_str(config_attr):
    """Format the ctime of the file named by a project_config attribute, or 'NA' on any failure."""
    try:
        path = getattr(project_config, config_attr)
        # Timezone-aware replacement for the deprecated utcfromtimestamp();
        # tzinfo is dropped so the formatted output stays that of a naive UTC time.
        ctime = datetime.datetime.fromtimestamp(
            os.path.getctime(path), tz=datetime.timezone.utc).replace(tzinfo=None)
        return ctime.strftime(project_config.DATETIME_FORMAT2)
    except Exception as e:
        print(e)
        return 'NA'


def get_dashboard_data_status():
    """
    Report the timestamps of the dashboard result file and the numerati file.

    Each timestamp is the ``os.path.getctime`` value of the file, expressed in
    UTC and formatted with ``project_config.DATETIME_FORMAT2``. If a timestamp
    cannot be obtained, the error is printed and 'NA' is reported for it.

    :return dashboard_data_tstr: formatted time of the dashboard model result file, or 'NA'
    :return nmtd_tstr: formatted time of the numerati file, or 'NA'
    """
    dashboard_data_tstr = _config_file_ctime_str('DASHBOARD_MODEL_RESULT_FILE')
    nmtd_tstr = _config_file_ctime_str('NUMERATI_FILE')
    return dashboard_data_tstr, nmtd_tstr
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+
810
+
811
+
812
+
813
+