Add application file
- app.py +33 -0
- model.py +104 -0
- models/lgb_predict_1day_ver2_AGRO.pkl +3 -0
- models/lgb_predict_1day_ver2_CHMF.pkl +3 -0
- models/lgb_predict_1day_ver2_MTSS.pkl +3 -0
- models/lgb_predict_1day_ver2_SBER.pkl +3 -0
- models/lgb_predict_1day_ver2_SIBN.pkl +3 -0
- models/lgb_predict_1day_ver2_SMLT.pkl +3 -0
- models/lgb_predict_1day_ver2_VKCO.pkl +3 -0
- models/lgb_predict_1day_ver2_YNDX.pkl +3 -0
- models/linear_predict_1day_ver2_AGRO.pkl +3 -0
- models/linear_predict_1day_ver2_CHMF.pkl +3 -0
- models/linear_predict_1day_ver2_MTSS.pkl +3 -0
- models/linear_predict_1day_ver2_SBER.pkl +3 -0
- models/linear_predict_1day_ver2_SIBN.pkl +3 -0
- models/linear_predict_1day_ver2_SMLT.pkl +3 -0
- models/linear_predict_1day_ver2_VKCO.pkl +3 -0
- models/linear_predict_1day_ver2_YNDX.pkl +3 -0
- models/nn_predict_1day_ver2_AGRO.h5 +3 -0
- models/nn_predict_1day_ver2_CHMF.h5 +3 -0
- models/nn_predict_1day_ver2_MTSS.h5 +3 -0
- models/nn_predict_1day_ver2_SBER.h5 +3 -0
- models/nn_predict_1day_ver2_SIBN.h5 +3 -0
- models/nn_predict_1day_ver2_SMLT.h5 +3 -0
- models/nn_predict_1day_ver2_VKCO.h5 +3 -0
- models/nn_predict_1day_ver2_YNDX.h5 +3 -0
- news.py +94 -0
- stock_and.py +81 -0
app.py
ADDED
@@ -0,0 +1,33 @@
+import gradio as gr
+
+from tqdm import tqdm
+from model import Model
+tqdm.pandas()
+
+def get_predict(stock_name, pred_days, model_name, add_smoothing):
+    '''Builds the forecast, renders the chart, and reports feature importances.'''
+    model = Model(stock_name, model_name)
+    data, string = model.predict(pred_days)
+    p = model.plot_predict(data, add_smoothing)
+    return p, string
+
+stock_list = ['VKCO', 'SBER', 'CHMF', 'MTSS', 'SMLT', 'AGRO', 'SIBN', 'YNDX']  # Tickers available for forecasting
+
+with gr.Blocks() as demo:
+    gr.Markdown("# Stock price prediction app")
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            stock_dropdown = gr.Dropdown(choices=stock_list, label="Select a stock")
+            slider = gr.Slider(1, 7, step=1, label="Forecast horizon (days)")
+            model_dropdown = gr.Dropdown(choices=['LinearRegression', 'NN', 'LGB'], label="Select a model")
+            checkbox = gr.Checkbox(label="Add a smoothed forecast", value=True)
+
+        with gr.Column(scale=2):
+            output_plot = gr.Plot()
+            output_md = gr.Markdown()
+
+    submit_button = gr.Button("Generate forecast")
+    submit_button.click(fn=get_predict, inputs=[stock_dropdown, slider, model_dropdown, checkbox], outputs=[output_plot, output_md])
+
+demo.launch()
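A Space like this also needs a requirements.txt covering the imports in this commit. A plausible sketch, with the package set inferred from the import statements (versions deliberately unpinned, and the lightgbm/scikit-learn entries are assumptions based on what the pickled models would need to deserialize):

# requirements.txt -- a sketch inferred from the imports in this commit;
# versions are unpinned and would need to be verified for the Space
gradio
tqdm
pandas
numpy
tensorflow
plotly
requests
beautifulsoup4
pymorphy2
transformers
torch           # pipeline(..., framework="pt") in news.py requires a PyTorch backend
apimoex
lightgbm        # assumption: needed to unpickle the lgb_*.pkl models
scikit-learn    # assumption: needed to unpickle the linear_*.pkl models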
model.py
ADDED
@@ -0,0 +1,104 @@
+import pandas as pd
+import tensorflow as tf
+import pickle
+import plotly.express as px
+import os
+import numpy as np
+
+from stock_and import GetNewData
+
+class Model:
+    def __init__(self, stock_name, model_name) -> None:
+        """Builds forecasts, renders charts, and computes feature importances."""
+        self.stock_name = stock_name  # Ticker
+        self.features = ['lag_25', 'lag_34', 'lag_33', 'lag_26', 'lag_32', 'lag_31', 'lag_30', 'lag_29', 'lag_27', 'sentiment_neutral', 'lag_28',
+                         'sentiment_positive', 'sentiment_negative', 'month', 'day']  # Model features
+        self.model_name = model_name  # Model name
+
+        # Load the trained model
+        if model_name == 'NN':
+            self.model = tf.keras.models.load_model(f'models/nn_predict_1day_ver2_{stock_name}.h5', custom_objects={'mae': tf.keras.metrics.MeanAbsoluteError()})
+
+        if model_name == 'LinearRegression':
+            with open(os.path.join('models', f'linear_predict_1day_ver2_{stock_name}.pkl'), 'rb') as f:
+                self.model = pickle.load(f)
+
+        if model_name == 'LGB':
+            with open(os.path.join('models', f'lgb_predict_1day_ver2_{stock_name}.pkl'), 'rb') as f:
+                self.model = pickle.load(f)
+
+    def generate_dataset(self, stock_name, num_day):
+        """Fetches the dataset and appends new rows for the forecast horizon."""
+        merged_df, string = GetNewData(stock_name).get_full_data()  # Dataset with news and stock quotes
+
+        # Build a frame with the extra hourly rows to forecast
+        last_date = merged_df['DATE'].max()
+        new_date_rng = pd.date_range(start=last_date + pd.Timedelta(hours=1), periods=24 * num_day, freq='H')
+
+        new_df = pd.DataFrame(new_date_rng, columns=['DATE'])
+
+        # Back-fill the lag and sentiment features from the most recent history
+        for c in self.features:
+            last_value = merged_df[c].values[-24 * num_day:]
+            new_df[c] = last_value
+
+        # Calendar features are derived from DATE after the loop so the
+        # back-fill above does not overwrite them with stale values
+        new_df['month'] = new_df['DATE'].dt.month
+        new_df['day'] = new_df['DATE'].dt.day
+
+        # Concatenate history and forecast rows
+        merged_df_new = pd.concat([merged_df, new_df[self.features + ['DATE']]], ignore_index=True)
+        return merged_df_new, string
+
+    def predict(self, num_day):
+        # Build the forecast and the feature-importance report
+        merged_df, string = self.generate_dataset(self.stock_name, num_day)  # Fetch the dataset
+
+        if self.model_name == 'NN':
+            importance = np.abs(self.model.layers[0].get_weights()[0])
+            importance = np.mean(importance, axis=1)
+            df_weighted = merged_df[self.features].ffill().bfill()[-num_day*24:] * importance
+
+        if self.model_name == 'LinearRegression':
+            df_weighted = merged_df[self.features].ffill().bfill()[-num_day*24:] * self.model.coef_
+
+        if self.model_name == 'LGB':
+            df_weighted = merged_df[self.features].ffill().bfill()[-num_day*24:] * self.model.feature_importances_
+
+        average_values = df_weighted.mean(axis=0).abs().sort_values(ascending=False)
+        average_values_filtered = average_values.drop('lag_25')
+
+        total_sum = average_values_filtered.sum()
+        average_values_percentage = (average_values_filtered / total_sum) * 100
+
+        string += '\n Most useful features for the forecast: \n'
+
+        for f, v in zip(average_values_percentage.index, average_values_percentage.values):
+            string += f'- {f}: importance = {v:.2f}%\n'
+
+        if self.model_name in ['LinearRegression', 'LGB']:
+            return pd.DataFrame({
+                'predict': self.model.predict(merged_df[self.features].ffill().bfill().values),
+                'DATE': merged_df['DATE'].values,
+                'CLOSE': merged_df['CLOSE'].values
+            }), string
+
+        else:
+            return pd.DataFrame({
+                'predict': self.model.predict(merged_df[self.features].ffill().bfill().values)[:, 0],
+                'DATE': merged_df['DATE'].values,
+                'CLOSE': merged_df['CLOSE'].values
+            }), string
+
+    def plot_predict(self, predict, add_smoothing):
+        predict = predict[-24*12:]  # Keep the last 12 days for readability
+
+        # Rescale the raw predictions to the level of the observed close
+        scaling_factor = predict['CLOSE'].mean() / predict['predict'].mean()
+        scaled_preds = predict['predict'] * scaling_factor
+
+        fig = px.line(predict, x=predict.DATE, y='CLOSE', labels={'value': 'Price'}, title='CLOSE')
+        fig.add_scatter(x=predict.DATE, y=scaled_preds, mode='lines', name='Predict', opacity=0.7)
+
+        if add_smoothing:
+            smoothed_preds = pd.Series(scaled_preds).ewm(3).mean()
+            fig.add_scatter(x=predict.DATE, y=smoothed_preds, mode='lines', name='Smoothed forecast', opacity=0.7)
+
+        fig.update_layout(xaxis=dict(type='category'))
+        return fig
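For orientation, a minimal usage sketch of the Model class outside Gradio. It assumes network access to MOEX and ru.investing.com plus the LFS-tracked model files pulled locally; 'SBER' and 'LGB' are example arguments:

from model import Model

m = Model('SBER', 'LGB')                        # load the LightGBM model for SBER
data, report = m.predict(3)                     # 3-day horizon: prediction DataFrame + markdown report
fig = m.plot_predict(data, add_smoothing=True)  # Plotly figure with raw and smoothed forecasts
fig.show()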
models/lgb_predict_1day_ver2_AGRO.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12eac4a543adda6d7e50335fd4c742c75bcb974b662916084057ae3b6080d364
+size 111059
models/lgb_predict_1day_ver2_CHMF.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:363d694e123fa2678c57cc9cc7d2cc08aa18bf6cb8cd27f32504ea3a643a86d6
+size 108149
models/lgb_predict_1day_ver2_MTSS.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0efd081b82763fca72d13ef27090611d07204bf91587d04df28ec2ab5dba813b
+size 112768
models/lgb_predict_1day_ver2_SBER.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6b184feb04c371e6f08071de17c51445a28cee988cff9c9f8b7654ab3911b3b
+size 112729
models/lgb_predict_1day_ver2_SIBN.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bdecdfd8f57c50eb3cfefcf85c79b4e560bc26c184cb9b1ad74729fcae40edc1
+size 112621
models/lgb_predict_1day_ver2_SMLT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fed679e9165076c8b13f80740f5f5783a8f991d091e7ef3f037ccbb970286bc9
+size 112694
models/lgb_predict_1day_ver2_VKCO.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ee5fa66fbd15533f9c7cc5441cde36489754ebbd35228228bf18e677ac9fce3
+size 113490
models/lgb_predict_1day_ver2_YNDX.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d72c9985aa69d15b9d374928fde8da467e479142dddbcd7db420ae485ea82efe
+size 111507
models/linear_predict_1day_ver2_AGRO.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:351f3dab393b18467096c8d7b60328d015a95664ea0a6803eb176687ef4630bc
+size 963
models/linear_predict_1day_ver2_CHMF.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3d43e65ea82c9a82a55aff31a8912ed96f2130644f0fe699c988ada0445692c
+size 963
models/linear_predict_1day_ver2_MTSS.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:219e726dfdc4c31bc340629a1d8d27511c31770dec679a5a2c3c91ecdc01265c
+size 963
models/linear_predict_1day_ver2_SBER.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d9702e128cf3bcf6f443d0c43edc252ae49c9ed43154359119a73f0c8a9b57d
+size 963
models/linear_predict_1day_ver2_SIBN.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a4f667e472c79b660aa6699fafeb737f57d298e607a56b665fa4fdc0bb2e6cf
+size 963
models/linear_predict_1day_ver2_SMLT.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17f202f365d672df86daa3dc506d8f9f1107c5decba08500cee86896e54c37d5
+size 963
models/linear_predict_1day_ver2_VKCO.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:137d9f838962f785be93d826d7aee964dd12d260d7a7b55e1c5d382057c16b05
+size 963
models/linear_predict_1day_ver2_YNDX.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed581f0da796a8f864ecd7a35d82fb9ab18f374ee1cb2fc441206643d2e801dd
+size 963
models/nn_predict_1day_ver2_AGRO.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:79ef5af4053c6f8568071ecb1312360a53e5167782b72b013b5ef9365e57a95e
+size 23920
models/nn_predict_1day_ver2_CHMF.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9eee0d24f5e43527e7b8b4d8ba0c35ecbb3001c285436a659c1b357182effd1
+size 23920
models/nn_predict_1day_ver2_MTSS.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd3ff689591aff665292602995a5b5b74effea3a93d2f96560059e28487b80e
+size 23920
models/nn_predict_1day_ver2_SBER.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bebb94e7996a989ffb680b359df91d27892faecd20f5a056f0e860c50752d8a7
+size 23920
models/nn_predict_1day_ver2_SIBN.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9659fdb354f975c47e3fbc08b7cda9836e11a740b729eb88ccc1661183ff2527
+size 23920
models/nn_predict_1day_ver2_SMLT.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:847847ee5298aab687d3543cf0ae8ddca9b0075fdd0e56d31c37ec59dcbde815
+size 23920
models/nn_predict_1day_ver2_VKCO.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a6b68dcbabf1775b27d57c018e6ab2b730970a460445fe3ae51039944b7ad96
+size 23920
models/nn_predict_1day_ver2_YNDX.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a229e3e67e112bbbf121b048e3a69a86933584b34f9275d9e3679c1064ea66d9
+size 23920
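The 24 files above are Git LFS pointer stubs, not the weights themselves; a plain clone without git-lfs leaves only these three-line files. One way to fetch a single model file programmatically is via huggingface_hub; a sketch, where the repo_id is a placeholder since the Space's id does not appear in this diff:

from huggingface_hub import hf_hub_download

# repo_id is hypothetical; substitute the actual Space id
path = hf_hub_download(
    repo_id="owner/space-name",
    repo_type="space",
    filename="models/lgb_predict_1day_ver2_SBER.pkl",
)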
news.py
ADDED
@@ -0,0 +1,94 @@
+import requests
+import pandas as pd
+import time
+import pymorphy2
+import re
+
+from datetime import datetime, timedelta
+from transformers import pipeline
+from bs4 import BeautifulSoup
+
+class NewsData:
+    def __init__(self) -> None:
+        """
+        Article scraper for ru.investing.com.
+        """
+        self.urls = [
+            ("https://ru.investing.com/news/forex-news/", "forex-news"),
+            ("https://ru.investing.com/news/commodities-news/", "commodities-news"),
+        ]
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Referer': 'http://google.com'
+        }
+
+        self.morph = pymorphy2.MorphAnalyzer()  # Used to reduce words to their base form
+
+    def get_data(self):
+        # Scrape the sites for the last 7 days
+        date_limit = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=7)
+        articles_data = []
+
+        for base_url, tag in self.urls:
+            page = 1
+            while True:
+                url = f"{base_url}{page}/"
+                response = requests.get(url, headers=self.headers)
+
+                if response.status_code == 200:
+                    soup = BeautifulSoup(response.content, "html.parser")
+                    articles = soup.find_all("a", attrs={"data-test": "article-title-link"})
+
+                    daily_count = 0
+
+                    for article in articles:
+                        title = article.text.strip()
+                        date_tag = article.find_next("time", attrs={"data-test": "article-publish-date"})
+                        url = article["href"]
+
+                        if date_tag:
+                            publish_date_str = date_tag["datetime"]
+                            publish_date = datetime.strptime(publish_date_str, "%Y-%m-%d %H:%M:%S")
+
+                            articles_data.append([title, publish_date_str, tag, url])
+                            daily_count += 1
+
+                    page += 1
+
+                    if publish_date < date_limit:
+                        print(f"Publish date {publish_date_str} is past the limit. Stopping.")
+                        print(f"Total number of downloaded articles: {len(articles_data)}")
+                        break
+                else:
+                    if response.status_code == 404:
+                        break
+
+                    print(f"Error requesting page {page}, status code: {response.status_code}")
+                    time.sleep(5)
+
+        return self.get_processing_news(articles_data)
+
+    def clean_text(self, text):
+        text = re.sub(r'[^а-яА-Я\s]', '', text)  # Keep only Cyrillic letters and whitespace
+        text = text.lower()
+        text = ' '.join([word for word in text.split() if len(word) >= 3])
+        text = ' '.join([self.morph.parse(word)[0].normal_form for word in text.split()])
+        return text
+
+    def get_processing_news(self, articles_data):
+        # Post-process the scraped data and add sentiment analysis
+        sentiment_model = pipeline("sentiment-analysis", model="mxlcw/rubert-tiny2-russian-financial-sentiment", framework="pt")
+
+        news = pd.DataFrame(articles_data, columns=["title", "DATE", "tags", 'url'])
+
+        news['title'] = news['title'].ffill().map(self.clean_text)
+        news['sentiment'] = news['title'].progress_apply(lambda x: sentiment_model(x)[0]['label'])
+        news = pd.get_dummies(news, columns=['sentiment'])
+
+        news['DATE'] = pd.to_datetime(news['DATE'])
+        news['day'] = news['DATE'].dt.day
+        news['year'] = news['DATE'].dt.year
+        news['month'] = news['DATE'].dt.month
+        news['hour'] = news['DATE'].dt.hour
+        return news
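A minimal usage sketch of NewsData (network-dependent; the first run also downloads the rubert-tiny2 sentiment model from the Hub). Note that get_processing_news uses progress_apply, so tqdm.pandas() must be called first, as app.py does:

from tqdm import tqdm
from news import NewsData

tqdm.pandas()                    # registers progress_apply on pandas objects
news_df = NewsData().get_data()  # ~7 days of headlines with sentiment dummy columns
print(news_df[['title', 'DATE', 'tags']].head())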
stock_and.py
ADDED
@@ -0,0 +1,81 @@
+import pandas as pd
+import requests
+import time
+import apimoex
+
+from datetime import datetime, timedelta
+from news import NewsData
+
+class GetNewData:
+    def __init__(self, stock_name) -> None:
+        """Fetches and merges the stock and news data."""
+        self.news = NewsData()
+        self.stock_name = stock_name  # Ticker
+
+    def download_stock(self):
+        # Remap the ticker: this stock changed its name
+        if self.stock_name == 'YNDX':
+            self.stock_name = 'YDEX'
+
+        # Retry up to 5 times: the MOEX server occasionally fails to find the stock
+        for _ in range(5):
+            try:
+                # Start date of the dataset
+                new_date = datetime.now().replace(minute=0, second=0, microsecond=0) - timedelta(days=24*30)
+                new_date = str(new_date.strftime('%Y-%m-%d %H:%M'))
+
+                # Query the Moscow Exchange and build the dataset
+                with requests.Session() as session:
+                    # Fetch hourly candles from the exchange
+                    market_data = apimoex.get_board_candles(session, security=self.stock_name, market='shares', columns=['close', 'begin', 'volume'],
+                                                            start=new_date, interval=60)
+
+                    df = pd.DataFrame(market_data)
+                    df.columns = ['CLOSE', 'DATE', 'VOL']
+
+                    df['DATE'] = pd.to_datetime(df['DATE'])
+                    df['day'] = df['DATE'].dt.day
+                    df['year'] = df['DATE'].dt.year
+                    df['month'] = df['DATE'].dt.month
+                    df['hour'] = df['DATE'].dt.hour
+
+                    # Generate the lag features for the model
+                    for i in range(1, 10+1):
+                        df[f'lag_{i+24}'] = df['CLOSE'].shift(i)
+
+                    df['lag_24'] = df['CLOSE']
+                break
+            except Exception:
+                # On failure, wait 5 seconds and retry
+                time.sleep(5)
+        return df
+
+    def get_full_data(self):
+        """Builds the full merged dataset."""
+        articles_data = self.news.get_data()  # News data
+        stock = self.download_stock()  # Exchange data
+
+        # Aggregate the news to hourly granularity, then merge with the quotes
+        news_ = articles_data.drop(columns=['DATE']).groupby(['day', 'month', 'year', 'hour']).agg({
+            'sentiment_negative': 'mean',
+            'sentiment_neutral': 'mean',
+            'sentiment_positive': 'mean',
+            'url': lambda x: list(x),
+            'title': lambda x: list(x)
+        }).reset_index()
+
+        merged_df = pd.merge(stock, news_, how='left', on=['year', 'month', 'day', 'hour'], suffixes=('', '_y')).bfill().ffill()
+        merged_df = merged_df.sort_values(by=['DATE'])
+
+        # Score the importance of the articles
+        merged_df['super'] = merged_df['sentiment_positive'] + merged_df['sentiment_neutral']
+
+        string = ''
+        string += '\n Most useful articles for the forecast: \n'
+
+        for url, v, title in merged_df[-72:].sort_values('super', ascending=False)[['url', 'super', 'title']].values[:10]:
+            for u, t in zip(url, title):
+                string += f'- [News: {t}]({u}), importance: {v} \n'
+
+        return merged_df, string
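Finally, a minimal end-to-end sketch of GetNewData, the entry point model.py relies on (hits both MOEX and ru.investing.com; 'SBER' is an example ticker):

from tqdm import tqdm
from stock_and import GetNewData

tqdm.pandas()
merged_df, report = GetNewData('SBER').get_full_data()
print(merged_df[['DATE', 'CLOSE', 'lag_25', 'sentiment_positive']].tail())
print(report)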