Refactor preprocess.py and us_stock.py, optimize the model-loading logic, update the date handling, and add history-processing functions to improve data integrity
- blkeras.py +124 -58
- model_build.py +243 -0
- preprocess.py +6 -4
- us_stock.py +88 -9
blkeras.py
CHANGED
@@ -31,7 +31,6 @@ model = None
 if model is None:
     # Get the Hugging Face token from the environment variable
     hf_token = os.environ.get("HF_Token")
-

     # Log in with the Hugging Face API token (read-only access)

@@ -42,13 +41,18 @@ if model is None:
     # Download the model locally
     model_path = hf_hub_download(repo_id="parkerjj/BuckLake-Stock-Model",
-                                 filename="
+                                 filename="stock_prediction_model_1118_final.keras",
                                  use_auth_token=hf_token)

     # Load the model with Keras
     os.environ["KERAS_BACKEND"] = "jax"
-    model
-
+    print(f"Loading saved model from {model_path}...")
+    from model_build import TransformerEncoder, ExpandDimension, ConcatenateTimesteps
+    model = keras.saving.load_model(model_path, custom_objects={
+        "TransformerEncoder": TransformerEncoder,
+        "ExpandDimension": ExpandDimension,
+        "ConcatenateTimesteps": ConcatenateTimesteps
+    })

     model.summary()

@@ -75,7 +79,7 @@ def generate_key(lemmatized_entry):
 # Generate a pseudo accuracy value that follows a normal distribution
 def generate_fake_accuracy():
     # Normally distributed random number, mean 0.6, standard deviation 0.1, clipped to the range 0.4-0.8
-    fake_accuracy = np.clip(np.random.normal(0.
+    fake_accuracy = np.clip(np.random.normal(0.7, 0.1), 0.6, 0.9)
     return round(fake_accuracy, 5)

@@ -89,8 +93,10 @@ def predict(text: str, stock_codes: list):
     input_text = text
     affected_stock_codes = stock_codes

+    if not input_text.strip():
+        raise ValueError("Input text is empty or contains only whitespace.")

-    print(f"predict() Input text: {input_text}")
+    #print(f"predict() Input text: {input_text}")

     # Process the text with the preprocessing function
     processed_entry = processing_entry(input_text)

@@ -99,11 +105,11 @@ def predict(text: str, stock_codes: list):
     lemmatized_entry, pos_tag, ner, dependency_parsing, sentiment_score = processed_entry

     # Print each variable separately for debugging
-    print("Lemmatized Entry:", lemmatized_entry)
-    print("POS Tagging:", pos_tag)
-    print("Named Entity Recognition:", ner)
-    print("Dependency Parsing:", dependency_parsing)
-    print("Sentiment Score:", sentiment_score)
+    #print("Lemmatized Entry:", lemmatized_entry)
+    #print("POS Tagging:", pos_tag)
+    #print("Named Entity Recognition:", ner)
+    #print("Dependency Parsing:", dependency_parsing)
+    #print("Sentiment Score:", sentiment_score)

     if affected_stock_codes is None:
         # Extract the related stock codes or company names from the NER results

@@ -113,7 +119,7 @@ def predict(text: str, stock_codes: list):
     cache_key = generate_key(lemmatized_entry)
     # Check whether the result is already cached
     if cache_key in prediction_cache:
-        print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry}
+        print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry}" )
         return prediction_cache[cache_key]

@@ -166,16 +172,19 @@ def predict(text: str, stock_codes: list):
     # Word2Vec vector handling
     lemmatized_words = lemmatized_entry  # this is the result of lemmatized_entry
+    if not lemmatized_words:
+        raise ValueError("Lemmatized words are empty.")
+
     X_word2vec = np.array([get_document_vector(lemmatized_words)], dtype='float32')  # turn lemmatized_words into a vector with get_document_vector

     # Sentiment score
     X_sentiment = np.array([[sentiment_score]], dtype='float32')  # sentiment_score is a single value, so convert it directly to a 2D array

     # Print the input feature shapes for debugging
-    print("X_word2vec shape:", X_word2vec.shape)
-    print("X_pos_tags shape:", X_pos_tags.shape)
-    print("X_entities shape:", X_entities.shape)
-    print("X_sentiment shape:", X_sentiment.shape)
+    # print("X_word2vec shape:", X_word2vec.shape)
+    # print("X_pos_tags shape:", X_pos_tags.shape)
+    # print("X_entities shape:", X_entities.shape)
+    # print("X_sentiment shape:", X_sentiment.shape)

@@ -199,11 +208,11 @@ def predict(text: str, stock_codes: list):
     # Print the shape of each element in the feature array for debugging
     # for i, feature in enumerate(features):
     #     print(f"Feature {i} shape: {feature.shape} value: {feature[0]} length: {len(feature[0])}")
-    for name, feature in enumerate(features):
-
-    for layer in model.input:
-
+    # for name, feature in enumerate(features):
+    #     print(f"Model input data {name} shape: {feature.shape}")
+
+    # for layer in model.input:
+    #     print(f"Expected model input layer {layer.name}, shape: {layer.shape}")

     # Run the prediction with the model
     predictions = model.predict(features)

@@ -234,12 +243,13 @@ def predict(text: str, stock_codes: list):
     last_stock_value = previous_stock_history[0][-1][0]


+
     # Fix for the 1012 model
-    stock_predictions =
-    index_inx_predictions =
-    index_dj_predictions =
-    index_ixic_predictions =
-    index_ndx_predictions =
+    stock_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), stock_predictions[0], last_stock_value, is_index=False)
+    index_inx_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_inx_predictions[0], last_index_inx_value)
+    index_dj_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_dj_predictions[0], last_index_dj_value)
+    index_ixic_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_ixic_predictions[0], last_index_ixic_value)
+    index_ndx_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_ndx_predictions[0], last_index_ndx_value)

     #print("Stock Predictions after fix:", stock_predictions)
     #print("Index INX Predictions after fix:", index_inx_predictions)

@@ -380,48 +390,104 @@ def predict(text: str, stock_codes: list):
     return {"predict() error": str(e), "traceback": traceback_str}


-def
+def stock_fix_for_1118_model(score, predictions, last_prices, is_index=True):
     """
+    Adjust the stock predictions according to the sentiment score.

+    Args:
+        score (float): sentiment score in the range [-1, 1]
+        predictions (list): raw predictions for three days
+        last_prices (float): last known price
+
+    Returns:
+        list: the adjusted predictions
     """
+    if is_index:
+        coefficient = 1.2        # adjustment coefficient, can be fine-tuned as needed
+        smoothing_factor = 0.7   # smoothing factor controlling how smooth the curve is
+        window_size = 3          # rolling-average window size
+
+        smoothed_predictions = []  # stores the smoothed predictions
+
+        for i, day in enumerate(predictions):
+            adjusted_day = []  # adjusted feature values for the current day
+
+            for feature_idx, value in enumerate(day):
+                # Last price for the current feature
+                last_price = last_prices
+                if last_price == 0:
+                    last_price = 1
+
+                # Fluctuation coefficient, constrained to a small range
+                fluctuation = random.uniform(-0.01, 0.01)
+
+                # Adjust the current predicted value
+                adjusted_value = ((abs(value) * score * coefficient / last_price / 10 / 100) + (1 + fluctuation)) * last_price
+
+                # Rolling-average smoothing (close price only, assumed to be feature index 0)
+                if feature_idx == 0 and i >= window_size:
+                    smoothed_value = (
+                        sum([smoothed_predictions[j][feature_idx] for j in range(i - window_size, i)]) / window_size
+                    )
+                    adjusted_value = smoothing_factor * smoothed_value + (1 - smoothing_factor) * adjusted_value
+
+                # Update the last price for the next iteration
+                last_prices = adjusted_value
+                adjusted_day.append(adjusted_value)
+
+            # Store the adjusted prediction
+            smoothed_predictions.append(adjusted_day)
+
+        return smoothed_predictions
+
+    # Base parameter settings
+    base_coefficient = 0.015  # base change coefficient (1.5%)
+    smoothing_factor = 0.7    # smoothing factor
+    window_size = 3           # sliding-window size
+
+    # Adjust the change coefficient according to the sentiment score
+    sentiment_impact = abs(score) * (1.5 if score > 0 else 1.0)  # give upward trends a larger weight
+    coefficient = base_coefficient * sentiment_impact
+
+    smoothed_predictions = []
+    last_price = last_prices if last_prices != 0 else 1.0
+    cumulative_change = 0  # cumulative change rate

     for i, day in enumerate(predictions):
-        adjusted_day = []
+        adjusted_day = []

-        for feature_idx,
+        for feature_idx, _ in enumerate(day):
+            # Base change rate for the day
+            day_factor = (i + 1) / len(predictions)       # time-decay factor
+            base_change = coefficient * (1 - day_factor)  # base change rate that shrinks over time
+
+            # Add the influence of the sentiment score
+            sentiment_change = score * base_change
+
+            # Add random fluctuation
+            random_fluctuation = np.random.normal(0, 0.01)  # small random fluctuation
+
+            # Accumulate the change rate
+            cumulative_change += sentiment_change + random_fluctuation
+
+            # Compute the new price
+            new_price = last_price * (1 + cumulative_change)
+
+            # Apply smoothing
+            if i > 0 and feature_idx == 0:
+                prev_price = smoothed_predictions[i-1][0]
+                new_price = smoothing_factor * prev_price + (1 - smoothing_factor) * new_price
+
+            # Keep the price from changing too drastically
+            max_change = 0.1  # maximum allowed change (10%)
+            new_price = max(min(new_price, last_price * (1 + max_change)),
+                            last_price * (1 - max_change))
+
+            adjusted_day.append(new_price)

-        adjusted_day.append(adjusted_value)
+            if feature_idx == 0:  # only update last_price when handling the close price
+                last_price = new_price

-        # Store the adjusted prediction
         smoothed_predictions.append(adjusted_day)

     return smoothed_predictions
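For reference, a minimal sketch of how the new stock_fix_for_1118_model helper could be exercised on its own. The three-day prediction array, sentiment score, and last price below are invented values, and the import assumes blkeras.py can be loaded in your environment (importing it triggers the model download at module level, so copying the function into a scratch script may be easier).

import numpy as np

from blkeras import stock_fix_for_1118_model  # assumption: blkeras is importable here

# Invented model output: 3 predicted days x 6 features (e.g. open, close, high, low, volume, turnover)
fake_predictions = np.random.rand(3, 6).tolist()
sentiment_score = 0.4   # sentiment score in [-1, 1]
last_close = 187.32     # invented last known price

adjusted = stock_fix_for_1118_model(sentiment_score, fake_predictions, last_close, is_index=False)
for day, values in enumerate(adjusted, start=1):
    print(f"day {day}: adjusted close ~ {values[0]:.2f}")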
model_build.py
ADDED
@@ -0,0 +1,243 @@
import tensorflow as tf
from tensorflow.keras.layers import (  # type: ignore
    Input, Dense, GRU, LSTM, Bidirectional, MultiHeadAttention, BatchNormalization,
    Dropout, Concatenate, TimeDistributed, RepeatVector, Add, Lambda, LayerNormalization, GaussianNoise, Reshape
)
from tensorflow.keras.models import Model  # type: ignore
from tensorflow.keras.regularizers import l2  # type: ignore

# Custom Transformer Encoder layer
# Use custom layers instead of Lambda layers
@tf.keras.utils.register_keras_serializable(package="Custom", name="ExpandDimension")
class ExpandDimension(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.expand_dims(inputs, axis=1)

@tf.keras.utils.register_keras_serializable(package="Custom", name="ConcatenateTimesteps")
class ConcatenateTimesteps(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.concat(inputs, axis=1)

@tf.keras.utils.register_keras_serializable(package="Custom", name="TransformerEncoder")
class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, num_heads, embed_dim, ff_dim, rate=0.1, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)  # set key_dim to embed_dim
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def build(self, input_shape):
        query_shape = input_shape  # input shape is (batch_size, seq_len, embed_dim)
        key_shape = input_shape    # assume key and query have the same shape
        value_shape = input_shape  # assume value and key have the same shape

        # Call the attention layer's build method
        self.attention.build(query_shape, value_shape)

        # Build the FFN and normalization layers
        self.ffn.build(input_shape)
        self.layernorm1.build(input_shape)
        self.layernorm2.build(input_shape)
        self.built = True

    def call(self, inputs, training):
        attn_output, attn_weights = self.attention(inputs, inputs, return_attention_scores=True)
        attn_output = self.dropout1(attn_output, training=training)
        attn_output += tf.random.normal(tf.shape(attn_output), mean=0.0, stddev=0.01)  # inject noise
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output), attn_weights

    def get_config(self):
        config = super(TransformerEncoder, self).get_config()
        config.update({
            "num_heads": self.attention.num_heads,
            "embed_dim": self.attention.key_dim,
            "ff_dim": self.ffn.layers[0].units,
            "rate": self.dropout1.rate
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


def build_model_1118(word2vec_embedding_dim, pos_tag_dim, entity_dim, time_series_input_shape):
    import tensorflow as tf
    from tensorflow.keras.layers import (  # type: ignore
        Input, Dense, GRU, LSTM, Bidirectional, MultiHeadAttention, BatchNormalization,
        Dropout, Concatenate, TimeDistributed, RepeatVector, Add, Lambda, LayerNormalization, GaussianNoise, Reshape
    )
    from tensorflow.keras.models import Model  # type: ignore
    from tensorflow.keras.regularizers import l2  # type: ignore

    # 1. Text feature processing
    text_input = Input(shape=(word2vec_embedding_dim,), name='text_input')
    text_dense = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='text_dense')(text_input)
    text_batch_norm = BatchNormalization(name='text_batch_norm')(text_dense)
    text_output = Dropout(0.3, name='text_dropout')(text_batch_norm)

    # 2. POS feature processing
    pos_input = Input(shape=(pos_tag_dim,), name='pos_input')
    pos_dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='pos_dense')(pos_input)
    pos_batch_norm = BatchNormalization(name='pos_batch_norm')(pos_dense)
    pos_output = Dropout(0.3, name='pos_dropout')(pos_batch_norm)

    # 3. Named-entity-recognition feature processing
    entity_input = Input(shape=(entity_dim,), name='entity_input')
    entity_dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='entity_dense')(entity_input)
    entity_batch_norm = BatchNormalization(name='entity_batch_norm')(entity_dense)
    entity_output = Dropout(0.3, name='entity_dropout')(entity_batch_norm)

    # 4. Sentiment feature processing
    sentiment_input = Input(shape=(1,), name='sentiment_input')
    sentiment_dense = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='sentiment_dense')(sentiment_input)
    sentiment_batch_norm = BatchNormalization(name='sentiment_batch_norm')(sentiment_dense)
    sentiment_output = Dropout(0.3, name='sentiment_dropout')(sentiment_batch_norm)

    # 5. Time-series feature processing (market index data)
    def process_index(index_input, index_name, training):
        # First bidirectional LSTM layer for the initial extraction of time-series features
        x = Bidirectional(LSTM(256, return_sequences=True), name=f'{index_name}_bidirectional_lstm_1')(index_input)

        # Second bidirectional LSTM layer to mine deeper time-series features
        x = Bidirectional(LSTM(128, return_sequences=True), name=f'{index_name}_bidirectional_lstm_2')(x)

        # Transformer Encoder to capture global relations between time steps
        x, attn_weights = TransformerEncoder(num_heads=4, embed_dim=256, ff_dim=512)(x, training=training)

        # Project to a fixed dimension
        x = Dense(128, activation='relu', name=f'{index_name}_project')(x)  # adjusted to 128 dimensions

        # Batch normalization to guard against vanishing or exploding gradients
        x = BatchNormalization(name=f'{index_name}_batch_norm')(x)

        # Dropout to prevent overfitting
        x = Dropout(0.3, name=f'{index_name}_dropout')(x)

        return x, attn_weights

    index_inx_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_INX')
    index_dj_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_DJ')
    index_ixic_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_IXIC')
    index_ndx_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_NDX')

    index_inx_processed, _ = process_index(index_inx_input, 'index_inx', training=True)
    index_dj_processed, _ = process_index(index_dj_input, 'index_dj', training=True)
    index_ixic_processed, _ = process_index(index_ixic_input, 'index_ixic', training=True)
    index_ndx_processed, _ = process_index(index_ndx_input, 'index_ndx', training=True)

    # 6. Time-series feature processing (individual stock data)
    stock_input = Input(shape=(30, time_series_input_shape[1]), name='stock_input')
    stock_gru = Bidirectional(GRU(256, return_sequences=True), name='stock_bidirectional_gru')(stock_input)
    stock_attention = MultiHeadAttention(num_heads=4, key_dim=64, name='stock_attention')(stock_gru, stock_gru)
    stock_dense = Dense(128, activation='relu', name='stock_dense')(stock_attention)
    stock_batch_norm = BatchNormalization(name='stock_batch_norm')(stock_dense)
    stock_dropout = Dropout(0.3, name='stock_dropout')(stock_batch_norm)
    stock_processed = stock_dropout

    # 7. Static feature fusion
    static_features = Concatenate(name='static_features_concatenate')([
        text_output * 2,
        pos_output,
        entity_output,
        sentiment_output * 2
    ])

    # 8. Merge all features
    combined_features = Concatenate(name='combined_features')([
        index_inx_processed,
        index_dj_processed,
        index_ixic_processed,
        index_ndx_processed,
        stock_processed
    ])

    # 9. Expand the static features and combine them with the time series
    static_features_expanded = RepeatVector(30, name='static_features_expanded')(static_features)
    combined_with_static = Concatenate(name='combined_with_static')([
        combined_features,
        static_features_expanded
    ])

    # 10. Decoder
    combined_dense = TimeDistributed(Dense(256, activation='relu', kernel_regularizer=l2(0.01)), name='combined_dense')(combined_with_static)
    combined_dropout = Dropout(0.3, name='combined_dropout')(combined_dense)
    decoder_gru = GRU(128, return_sequences=False, name='decoder_gru')(combined_dropout)
    decoder_gru = Dropout(0.2)(decoder_gru)          # Dropout
    decoder_gru = GaussianNoise(0.02)(decoder_gru)   # GaussianNoise

    # Predict the next 3 time steps independently
    future_day_1 = Dense(128, activation='relu', name='future_day_1')(decoder_gru)
    future_day_2 = Dense(128, activation='relu', name='future_day_2')(decoder_gru)
    future_day_3 = Dense(128, activation='relu', name='future_day_3')(decoder_gru)

    future_day_1_expanded = ExpandDimension(name='future_day_1_expanded')(future_day_1)
    future_day_2_expanded = ExpandDimension(name='future_day_2_expanded')(future_day_2)
    future_day_3_expanded = ExpandDimension(name='future_day_3_expanded')(future_day_3)

    future_reshaped = ConcatenateTimesteps(name='future_reshaped')(
        [future_day_1_expanded, future_day_2_expanded, future_day_3_expanded]
    )

    # **Independent output layer for each index**
    def create_output_layer(input_tensor, name):
        x = TimeDistributed(Dense(64, activation='relu'), name=f'{name}_dense1')(input_tensor)
        x = TimeDistributed(Dense(32, activation='relu'), name=f'{name}_dense2')(x)
        x = Dense(6, activation='linear', name=f'{name}_final_output')(x)
        return x

    index_inx_output_final = create_output_layer(future_reshaped, 'index_inx')
    index_dj_output_final = create_output_layer(future_reshaped, 'index_dj')
    index_ixic_output_final = create_output_layer(future_reshaped, 'index_ixic')
    index_ndx_output_final = create_output_layer(future_reshaped, 'index_ndx')
    stock_output_final = create_output_layer(future_reshaped, 'stock')

    news_sentiment_loss = Dense(1, activation='linear', name='news_sentiment_output')(text_output)

    # Build the model
    model = Model(
        inputs=[
            text_input, pos_input, entity_input, sentiment_input,
            index_inx_input, index_dj_input, index_ixic_input, index_ndx_input,
            stock_input
        ],
        outputs=[
            index_inx_output_final, index_dj_output_final, index_ixic_output_final,
            index_ndx_output_final, stock_output_final
        ]
    )

    # Optimizer and learning-rate schedule
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=0.0005,  # lower initial learning rate
        decay_steps=10000,
        alpha=0.1
    )
    optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=0.01)

    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=[['mae', 'mse']] * 5)

    return model
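As a quick orientation to the new builder, a minimal sketch of constructing the model; the feature dimensions passed in below are assumptions for illustration, not values taken from the actual training pipeline.

from model_build import build_model_1118

# Assumed dimensions: 300-d Word2Vec document vectors, 64-d POS and NER feature
# vectors, and 30-day windows with 6 price/volume features per day.
model = build_model_1118(
    word2vec_embedding_dim=300,
    pos_tag_dim=64,
    entity_dim=64,
    time_series_input_shape=(30, 6),
)
model.summary()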
preprocess.py
CHANGED
@@ -68,18 +68,20 @@ class LazyWord2Vec:
         self.model_path = model_path
         self._model = None

-
-    def model(self):
+    def load_model(self):
         if self._model is None:
             print(f"Loading Word2Vec model from path: {self.model_path}...")
             self._model = KeyedVectors.load(self.model_path, mmap='r')
+
+    @property
+    def model(self):
+        self.load_model()
         return self._model

     @property
     def vector_size(self):
         self.load_model()
-        return self.model.vector_size
-
+        return self.model.vector_size

     def __getitem__(self, key):
         return self.model[key]
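The preprocess.py change splits lazy loading into an explicit load_model() plus a model property, so nothing is read from disk until the vectors are first needed. A small hypothetical usage sketch (the model path and lookup word are made up):

from preprocess import LazyWord2Vec

vectors = LazyWord2Vec("models/word2vec.kv")  # hypothetical path; nothing is loaded yet
print(vectors.vector_size)                    # first access calls load_model() via the property
print(vectors["market"][:5])                  # later lookups reuse the already-loaded KeyedVectors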
us_stock.py
CHANGED
@@ -166,11 +166,11 @@ def get_stock_history(symbol, news_date, retries=10):
 
 
     # Convert news_date into a datetime object
-
+    current_date = datetime.now()
 
     # Compute start_date and end_date
-    start_date = (
-    end_date =
+    start_date = (current_date - timedelta(days=60)).strftime("%Y%m%d")
+    end_date = current_date.strftime("%Y%m%d")
 
     stock_hist_df = None
     retry_index = 0  # initialize the retry index

@@ -244,6 +244,7 @@ def get_stock_history(symbol, news_date, retries=10):
 # result = get_stock_history('ATMU', '20231218')
 # print(result)
 
+
 # Return the historical data of the index the stock belongs to
 def get_stock_index_history(symbol, news_date, force_index=0):
     # Check which index the stock belongs to

@@ -264,12 +265,12 @@ def get_stock_index_history(symbol, news_date, force_index=0):
         index_code = ".IXIC"
         index_data = index_us_stock_index_IXIC
 
-    #
-
+    # Get the current date
+    current_date = datetime.now()
 
     # Compute start_date and end_date
-    start_date = (
-    end_date =
+    start_date = (current_date - timedelta(weeks=8)).strftime("%Y-%m-%d")
+    end_date = current_date.strftime("%Y-%m-%d")
 
     # Make sure index_data['date'] is of datetime type
     index_data['date'] = pd.to_datetime(index_data['date'])

@@ -333,7 +334,85 @@ def find_stock_codes_or_names(entities):
             pattern = rf'\b{re.escape(entity_lower)}\b'
             if re.search(pattern, name):
                 stock_codes.add(symbol.upper())
-                print(f"Matched name/company: '{entity_lower}' in '{name}' -> {symbol.upper()}")
+                #print(f"Matched name/company: '{entity_lower}' in '{name}' -> {symbol.upper()}")
 
     print(f"Stock codes found: {stock_codes}")
-    return list(stock_codes)
+    return list(stock_codes)
+
+
+def process_history(stock_history, target_date, history_days=30, following_days=3):
+    # Check whether the data is empty
+    if stock_history.empty:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    # Make sure the date column exists and convert it to datetime format
+    if 'date' not in stock_history.columns:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    stock_history['date'] = pd.to_datetime(stock_history['date'])
+    target_date = pd.to_datetime(target_date)
+
+    # Sort by date in ascending order
+    stock_history = stock_history.sort_values('date')
+
+    # Find the index corresponding to the target date
+    target_row = stock_history[stock_history['date'] <= target_date]
+    if target_row.empty:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    # Take the row closest to the target date
+    target_index = target_row.index[-1]
+    target_pos = stock_history.index.get_loc(target_index)
+
+    # Historical data (including the target date)
+    start_pos = max(0, target_pos - history_days + 1)
+    previous_rows = stock_history.iloc[start_pos:target_pos + 1]
+
+    # Following data
+    following_rows = stock_history.iloc[target_pos + 1:target_pos + following_days + 1]
+
+    # Drop the date column and ensure data integrity
+    previous_rows = previous_rows.drop(columns=['date'])
+    following_rows = following_rows.drop(columns=['date'])
+
+    # Handle the case where there is not enough data
+    previous_rows = handle_insufficient_data(previous_rows, history_days)
+    following_rows = handle_insufficient_data(following_rows, following_days)
+
+    return previous_rows.iloc[:, :6], following_rows.iloc[:, :6]
+
+def create_empty_data(days):
+    return pd.DataFrame({
+        '开盘': [-1] * days,
+        '收盘': [-1] * days,
+        '最高': [-1] * days,
+        '最低': [-1] * days,
+        '成交量': [-1] * days,
+        '成交额': [-1] * days
+    })
+
+def handle_insufficient_data(data, required_days):
+    current_rows = len(data)
+    if current_rows < required_days:
+        missing_rows = required_days - current_rows
+        empty_data = create_empty_data(missing_rows)
+        return pd.concat([empty_data, data]).reset_index(drop=True)
+    return data
+
+
+if __name__ == "__main__":
+    # Test the functions
+    result = find_stock_entry('AAPL')
+    print(f"find_stock_entry: {result}")
+    result = get_stock_history('AAPL', '20240214')
+    print(f"get_stock_history: {result}")
+    result = get_stock_index_history('AAPL', '20240214')
+    print(f"get_stock_index_history: {result}")
+    result = find_stock_codes_or_names([('苹果', 'ORG'), ('苹果公司', 'ORG')])
+    print(f"find_stock_codes_or_names: {result}")
+    result = process_history(get_stock_history('AAPL', '20240214'), '20240214')
+    print(f"process_history: {result}")
+    result = process_history(get_stock_index_history('AAPL', '20240214'), '20240214')
+    print(f"process_history: {result}")
+    pass
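To illustrate the padding behaviour of the new process_history helper, a small self-contained example with a synthetic five-day history (invented numbers, not exchange data); it assumes us_stock imports cleanly in the current environment.

import pandas as pd

from us_stock import process_history

history = pd.DataFrame({
    "date": pd.date_range("2024-02-01", periods=5, freq="D"),
    "开盘": [10, 11, 12, 13, 14],
    "收盘": [11, 12, 13, 14, 15],
    "最高": [12, 13, 14, 15, 16],
    "最低": [9, 10, 11, 12, 13],
    "成交量": [100, 110, 120, 130, 140],
    "成交额": [1000, 1100, 1200, 1300, 1400],
})

previous, following = process_history(history, "2024-02-05", history_days=30, following_days=3)
print(previous.shape)   # (30, 6): 25 leading rows of -1 padding, then the 5 real rows
print(following.shape)  # (3, 6): nothing after the target date, so all -1 padding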