KotVasily committed
Commit d39b93e · 1 Parent(s): 5984960

Add application file

app.py ADDED
@@ -0,0 +1,33 @@
+ import gradio as gr
+
+ from tqdm import tqdm
+ from model import Model
+ tqdm.pandas()  # registers progress_apply, used by the news sentiment pipeline
+
+ def get_predict(stock_name, pred_days, model_name, add_smoothing):
+     '''Builds the forecast and returns the plot plus a feature-importance report.'''
+     model = Model(stock_name, model_name)
+     data, string = model.predict(pred_days)
+     p = model.plot_predict(data, add_smoothing)
+     return p, string
+
+ stock_list = ['VKCO', 'SBER', 'CHMF', 'MTSS', 'SMLT', 'AGRO', 'SIBN', 'YNDX']  # Tickers available for forecasting
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# Stock price prediction app")
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             stock_dropdown = gr.Dropdown(choices=stock_list, label="Select a stock")
+             slider = gr.Slider(1, 7, step=1, label="Select the forecast horizon (days)")
+             model_dropdown = gr.Dropdown(choices=['LinearRegression', 'NN', 'LGB'], label="Select a model")
+             checkbox = gr.Checkbox(label="Add a smoothed forecast", value=True)
+
+         with gr.Column(scale=2):
+             output_plot = gr.Plot()
+             output_md = gr.Markdown()
+
+     submit_button = gr.Button("Generate forecast")
+     submit_button.click(fn=get_predict, inputs=[stock_dropdown, slider, model_dropdown, checkbox], outputs=[output_plot, output_md])
+
+ demo.launch()
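For a quick sanity check, the callback can also be driven outside the UI. A minimal sketch, assuming the LFS model files under models/ are pulled and the MOEX and news endpoints are reachable:

# Hypothetical smoke test for the submit_button callback; not part of the commit.
fig, report = get_predict('SBER', pred_days=3, model_name='LGB', add_smoothing=True)
fig.write_html('sber_forecast.html')  # persist the Plotly figure for inspection
print(report)                         # Markdown report of articles and feature importances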
model.py ADDED
@@ -0,0 +1,104 @@
+ import pandas as pd
+ import tensorflow as tf
+ import pickle
+ import plotly.express as px
+ import os
+ import numpy as np
+
+ from stock_and import GetNewData
+
+ class Model:
+     def __init__(self, stock_name, model_name) -> None:
+         """Builds forecasts, plots them, and computes feature importances."""
+         self.stock_name = stock_name  # Ticker
+         self.features = ['lag_25', 'lag_34', 'lag_33', 'lag_26', 'lag_32', 'lag_31', 'lag_30', 'lag_29', 'lag_27', 'sentiment_neutral', 'lag_28',
+                          'sentiment_positive', 'sentiment_negative', 'month', 'day']  # Model features
+         self.model_name = model_name  # Model name
+
+         # Load the requested model
+         if model_name == 'NN':
+             self.model = tf.keras.models.load_model(f'models/nn_predict_1day_ver2_{stock_name}.h5', custom_objects={'mae': tf.keras.metrics.MeanAbsoluteError()})
+
+         if model_name == 'LinearRegression':
+             with open(os.path.join('models', f'linear_predict_1day_ver2_{stock_name}.pkl'), 'rb') as f:
+                 self.model = pickle.load(f)
+
+         if model_name == 'LGB':
+             with open(os.path.join('models', f'lgb_predict_1day_ver2_{stock_name}.pkl'), 'rb') as f:
+                 self.model = pickle.load(f)
+
+     def generate_dataset(self, stock_name, num_day):
+         """Fetches the dataset and appends new rows for the forecast horizon."""
+         merged_df, string = GetNewData(stock_name).get_full_data()  # Combined news and stock dataset
+
+         # Build a frame with the extra forecast rows, one per hour
+         last_date = merged_df['DATE'].max()
+         new_date_rng = pd.date_range(start=last_date + pd.Timedelta(hours=1), periods=24 * num_day, freq='H')
+
+         new_df = pd.DataFrame(new_date_rng, columns=['DATE'])
+         new_df['month'] = new_df['DATE'].dt.month
+         new_df['day'] = new_df['DATE'].dt.day
+
+         # Seed each feature with its most recent observed values
+         for c in self.features:
+             last_value = merged_df[c].values[-24 * num_day:]
+             new_df[c] = last_value
+
+         # Append the new rows to the dataset
+         merged_df_new = pd.concat([merged_df, new_df[self.features + ['DATE']]], ignore_index=True)
+         return merged_df_new, string
+
+     def predict(self, num_day):
+         # Build the forecast and the feature-importance report
+         merged_df, string = self.generate_dataset(self.stock_name, num_day)  # Fetch the dataset
+
+         if self.model_name == 'NN':
+             importance = np.abs(self.model.layers[0].get_weights()[0])
+             importance = np.mean(importance, axis=1)
+             df_weighted = merged_df[self.features].ffill().bfill()[-num_day*24:] * importance
+
+         if self.model_name == 'LinearRegression':
+             df_weighted = merged_df[self.features].ffill().bfill()[-num_day*24:] * self.model.coef_
+
+         if self.model_name == 'LGB':
+             df_weighted = merged_df[self.features].ffill().bfill()[-num_day*24:] * self.model.feature_importances_
+
+         average_values = df_weighted.mean(axis=0).abs().sort_values(ascending=False)
+         average_values_filtered = average_values.drop('lag_25')
+
+         total_sum = average_values_filtered.sum()
+         average_values_percentage = (average_values_filtered / total_sum) * 100
+
+         string += '\n Most useful features for the forecast: \n'
+
+         for f, v in zip(average_values_percentage.index, average_values_percentage.values):
+             string += f'- {f}: importance = {v:.2f}%\n'
+
+         if self.model_name in ['LinearRegression', 'LGB']:
+             return pd.DataFrame({
+                 'predict': self.model.predict(merged_df[self.features].ffill().bfill().values),
+                 'DATE': merged_df['DATE'].values,
+                 'CLOSE': merged_df['CLOSE'].values
+             }), string
+
+         else:
+             return pd.DataFrame({
+                 'predict': self.model.predict(merged_df[self.features].ffill().bfill().values)[:, 0],
+                 'DATE': merged_df['DATE'].values,
+                 'CLOSE': merged_df['CLOSE'].values
+             }), string
+
+     def plot_predict(self, predict, add_smoothing):
+         predict = predict[-24*12:]  # Show only the last 12 days
+
+         # Rescale predictions to the level of the observed close prices
+         scaling_factor = predict['CLOSE'].mean() / predict['predict'].mean()
+         scaled_preds = predict['predict'] * scaling_factor
+
+         fig = px.line(predict, x=predict.DATE, y='CLOSE', labels={'value': 'Price'}, title='CLOSE')
+         fig.add_scatter(x=predict.DATE, y=scaled_preds, mode='lines', name='Predict', opacity=0.7)
+
+         if add_smoothing:
+             smoothed_preds = pd.Series(scaled_preds).ewm(3).mean()
+             fig.add_scatter(x=predict.DATE, y=smoothed_preds, mode='lines', name='Smoothed forecast', opacity=0.7)
+
+         fig.update_layout(xaxis=dict(type='category'))
+         return fig
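For reference, a minimal sketch of driving Model directly, assuming the weight files listed below are available locally and both data sources respond:

from tqdm import tqdm
tqdm.pandas()  # progress_apply in news.py requires this

from model import Model

m = Model('SBER', 'LinearRegression')            # loads models/linear_predict_1day_ver2_SBER.pkl
preds, report = m.predict(num_day=3)             # DataFrame with predict/DATE/CLOSE, plus a Markdown report
fig = m.plot_predict(preds, add_smoothing=True)
fig.write_html('forecast.html')                  # or fig.show() in an interactive session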
models/lgb_predict_1day_ver2_AGRO.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12eac4a543adda6d7e50335fd4c742c75bcb974b662916084057ae3b6080d364
+ size 111059
models/lgb_predict_1day_ver2_CHMF.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:363d694e123fa2678c57cc9cc7d2cc08aa18bf6cb8cd27f32504ea3a643a86d6
+ size 108149
models/lgb_predict_1day_ver2_MTSS.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0efd081b82763fca72d13ef27090611d07204bf91587d04df28ec2ab5dba813b
+ size 112768
models/lgb_predict_1day_ver2_SBER.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6b184feb04c371e6f08071de17c51445a28cee988cff9c9f8b7654ab3911b3b
+ size 112729
models/lgb_predict_1day_ver2_SIBN.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bdecdfd8f57c50eb3cfefcf85c79b4e560bc26c184cb9b1ad74729fcae40edc1
+ size 112621
models/lgb_predict_1day_ver2_SMLT.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fed679e9165076c8b13f80740f5f5783a8f991d091e7ef3f037ccbb970286bc9
+ size 112694
models/lgb_predict_1day_ver2_VKCO.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2ee5fa66fbd15533f9c7cc5441cde36489754ebbd35228228bf18e677ac9fce3
+ size 113490
models/lgb_predict_1day_ver2_YNDX.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d72c9985aa69d15b9d374928fde8da467e479142dddbcd7db420ae485ea82efe
+ size 111507
models/linear_predict_1day_ver2_AGRO.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:351f3dab393b18467096c8d7b60328d015a95664ea0a6803eb176687ef4630bc
+ size 963
models/linear_predict_1day_ver2_CHMF.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3d43e65ea82c9a82a55aff31a8912ed96f2130644f0fe699c988ada0445692c
+ size 963
models/linear_predict_1day_ver2_MTSS.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:219e726dfdc4c31bc340629a1d8d27511c31770dec679a5a2c3c91ecdc01265c
+ size 963
models/linear_predict_1day_ver2_SBER.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d9702e128cf3bcf6f443d0c43edc252ae49c9ed43154359119a73f0c8a9b57d
+ size 963
models/linear_predict_1day_ver2_SIBN.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a4f667e472c79b660aa6699fafeb737f57d298e607a56b665fa4fdc0bb2e6cf
+ size 963
models/linear_predict_1day_ver2_SMLT.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:17f202f365d672df86daa3dc506d8f9f1107c5decba08500cee86896e54c37d5
+ size 963
models/linear_predict_1day_ver2_VKCO.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:137d9f838962f785be93d826d7aee964dd12d260d7a7b55e1c5d382057c16b05
+ size 963
models/linear_predict_1day_ver2_YNDX.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed581f0da796a8f864ecd7a35d82fb9ab18f374ee1cb2fc441206643d2e801dd
+ size 963
models/nn_predict_1day_ver2_AGRO.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:79ef5af4053c6f8568071ecb1312360a53e5167782b72b013b5ef9365e57a95e
+ size 23920
models/nn_predict_1day_ver2_CHMF.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9eee0d24f5e43527e7b8b4d8ba0c35ecbb3001c285436a659c1b357182effd1
+ size 23920
models/nn_predict_1day_ver2_MTSS.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fd3ff689591aff665292602995a5b5b74effea3a93d2f96560059e28487b80e
+ size 23920
models/nn_predict_1day_ver2_SBER.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bebb94e7996a989ffb680b359df91d27892faecd20f5a056f0e860c50752d8a7
+ size 23920
models/nn_predict_1day_ver2_SIBN.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9659fdb354f975c47e3fbc08b7cda9836e11a740b729eb88ccc1661183ff2527
+ size 23920
models/nn_predict_1day_ver2_SMLT.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:847847ee5298aab687d3543cf0ae8ddca9b0075fdd0e56d31c37ec59dcbde815
+ size 23920
models/nn_predict_1day_ver2_VKCO.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a6b68dcbabf1775b27d57c018e6ab2b730970a460445fe3ae51039944b7ad96
+ size 23920
models/nn_predict_1day_ver2_YNDX.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a229e3e67e112bbbf121b048e3a69a86933584b34f9275d9e3679c1064ea66d9
+ size 23920
news.py ADDED
@@ -0,0 +1,94 @@
+ import requests
+ import pandas as pd
+ import time
+ import pymorphy2
+ import re
+
+ from datetime import datetime, timedelta
+ from transformers import pipeline
+ from bs4 import BeautifulSoup
+
+ class NewsData:
+     def __init__(self) -> None:
+         """
+         Article scraper for ru.investing.com.
+         """
+         self.urls = [
+             ("https://ru.investing.com/news/forex-news/", "forex-news"),
+             ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
+         ]
+         self.headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Referer': 'http://google.com'
+         }
+
+         self.morph = pymorphy2.MorphAnalyzer()  # Used to reduce words to their base (normal) form
+
+     def get_data(self):
+         # Scrape the sites for the last 7 days
+         date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
+         articles_data = []
+
+         for base_url, tag in self.urls:
+             page = 1
+             while True:
+                 url = f"{base_url}{page}/"
+                 response = requests.get(url, headers=self.headers)
+
+                 if response.status_code == 200:
+                     soup = BeautifulSoup(response.content, "html.parser")
+                     articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
+
+                     daily_count = 0
+
+                     for article in articles:
+                         title = article.text.strip()
+                         date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
+                         url = article["href"]
+
+                         if date_tag:
+                             publish_date_str = date_tag["datetime"]
+                             publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")
+
+                             articles_data.append([title, publish_date_str, tag, url])
+                             daily_count += 1
+
+                     page += 1
+
+                     if publish_date < date_limit:
+                         print(f"Publish date {publish_date_str} is older than the limit. Stopping.")
+                         print(f"Total number of downloaded articles: {len(articles_data)}")
+                         break
+                 else:
+                     if response.status_code == 404:
+                         break
+
+                     print(f"Error requesting page {page}, response code: {response.status_code}")
+                     time.sleep(5)
+
+         return self.get_processing_news(articles_data)
+
+     def clean_text(self, text):
+         text = re.sub(r'[^а-яА-Я\s]', '', text)  # Keep only Cyrillic letters and whitespace
+         text = text.lower()
+         text = ' '.join([word for word in text.split() if len(word) >= 3])
+         text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])
+         return text
+
+     def get_processing_news(self, articles_data):
+         # Post-process the scraped data and add sentiment scores
+         sentiment_model = pipeline("sentiment-analysis", model="mxlcw/rubert-tiny2-russian-financial-sentiment", framework="pt")
+
+         news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", 'url'])
+
+         news['title'] = news['title'].ffill().map(self.clean_text)
+         news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
+         news = pd.get_dummies(news, columns=['sentiment'])
+
+         news['DATE'] = pd.to_datetime(news['DATE'])
+         news['day'] = news['DATE'].dt.day
+         news['year'] = news['DATE'].dt.year
+         news['month'] = news['DATE'].dt.month
+         news['hour'] = news['DATE'].dt.hour
+         return news
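A hedged usage sketch for the scraper on its own; note that progress_apply only exists after tqdm.pandas() has been called (app.py does this at import time):

from tqdm import tqdm
tqdm.pandas()  # registers progress_apply on pandas Series

from news import NewsData

news_df = NewsData().get_data()  # scrapes ~7 days of headlines and scores each title
print(news_df.head())            # one row per article: title, DATE, tags, url, sentiment_* dummies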
stock_and.py ADDED
@@ -0,0 +1,81 @@
+ import pandas as pd
+ import requests
+ import time
+ import apimoex
+
+ from datetime import datetime, timedelta
+ from news import NewsData
+
+ class GetNewData:
+     def __init__(self, stock_name) -> None:
+         """Fetches and merges the stock and news data."""
+         self.news = NewsData()
+         self.stock_name = stock_name  # Ticker
+
+     def download_stock(self):
+         # Remap the ticker: the stock changed its name (YNDX -> YDEX)
+         if self.stock_name == 'YNDX':
+             self.stock_name = 'YDEX'
+
+         # Retry up to 5 times: the Moscow Exchange server sometimes errors out or simply fails to find the stock
+         for _ in range(5):
+             try:
+                 # Start date of the dataset
+                 new_date = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=24*30)
+                 new_date = str(new_date.strftime('%Y-%m-%d %H:%M'))
+
+                 # Query the Moscow Exchange and build the dataset
+                 with requests.Session() as session:
+                     # Fetch hourly candles from the exchange
+                     market_data = apimoex.get_board_candles(session, security=self.stock_name, market='shares', columns=['close', 'begin', 'volume'],
+                                                             start=new_date, interval=60)
+
+                     df = pd.DataFrame(market_data)
+                     df.columns = ['CLOSE', 'DATE', 'VOL']
+
+                     df['DATE'] = pd.to_datetime(df['DATE'])
+                     df['day'] = df['DATE'].dt.day
+                     df['year'] = df['DATE'].dt.year
+                     df['month'] = df['DATE'].dt.month
+                     df['hour'] = df['DATE'].dt.hour
+
+                     # Generate the lag features for the model
+                     for i in range(1, 10+1):
+                         df[f'lag_{i+24}'] = df['CLOSE'].shift(i)
+
+                     df['lag_24'] = df['CLOSE']
+                 break
+             except Exception:
+                 # On failure, wait 5 seconds and try again
+                 time.sleep(5)
+         return df
+
+     def get_full_data(self):
+         """Builds the full merged dataset."""
+         articles_data = self.news.get_data()  # News data
+         stock = self.download_stock()  # Exchange data
+
+         # Aggregate the news by hour and merge with the stock data
+         news_ = articles_data.drop(columns=['DATE']).groupby(['day', 'month', 'year', 'hour']).agg({
+             'sentiment_negative': 'mean',
+             'sentiment_neutral': 'mean',
+             'sentiment_positive': 'mean',
+             'url': lambda x: list(x),
+             'title': lambda x: list(x)
+         }).reset_index()
+
+         merged_df = pd.merge(stock, news_, how='left', on=['year', 'month', 'day', 'hour'], suffixes=('', '_y')).bfill().ffill()
+         merged_df = merged_df.sort_values(by=['DATE'])
+
+         # Score article relevance
+         merged_df['super'] = merged_df['sentiment_positive'] + merged_df['sentiment_neutral']
+
+         string = ''
+         string += '\n Most useful articles for the forecast: \n'
+
+         for url, v, title in merged_df[-72:].sort_values('super', ascending=False)[['url', 'super', 'title']].values[:10]:
+             for u, t in zip(url, title):
+                 string += f'- [News: {t}]({u}), importance: {v} \n'
+
+         return merged_df, string
+
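Finally, a minimal end-to-end sketch of the data layer, assuming the MOEX ISS API (via apimoex) and ru.investing.com are reachable:

from tqdm import tqdm
tqdm.pandas()  # needed by the news pipeline downstream

from stock_and import GetNewData

merged_df, report = GetNewData('SBER').get_full_data()
print(merged_df[['DATE', 'CLOSE', 'lag_25', 'sentiment_positive']].tail())
print(report)  # Markdown list of the most relevant recent articles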