parkerjj committed
Commit d48ef09 · 1 Parent(s): 851d900

Refactor preprocess.py and us_stock.py: optimize the model-loading logic, update the date handling, and add history-processing functions to improve data completeness

Files changed (4)
  1. blkeras.py +124 -58
  2. model_build.py +243 -0
  3. preprocess.py +6 -4
  4. us_stock.py +88 -9
blkeras.py CHANGED
@@ -31,7 +31,6 @@ model = None
 if model is None:
     # Read the Hugging Face token from the environment
     hf_token = os.environ.get("HF_Token")
-
 
 
     # Log in with the Hugging Face API token (read-only access)
@@ -42,13 +41,18 @@ if model is None:
 
     # Download the model locally
     model_path = hf_hub_download(repo_id="parkerjj/BuckLake-Stock-Model",
-                                 filename="stock_prediction_model_1012.keras",
+                                 filename="stock_prediction_model_1118_final.keras",
                                  use_auth_token=hf_token)
 
     # Load the model with Keras
     os.environ["KERAS_BACKEND"] = "jax"
-    model = keras.saving.load_model(model_path)
-
+    print(f"Loading saved model from {model_path}...")
+    from model_build import TransformerEncoder, ExpandDimension, ConcatenateTimesteps
+    model = keras.saving.load_model(model_path, custom_objects={
+        "TransformerEncoder": TransformerEncoder,
+        "ExpandDimension": ExpandDimension,
+        "ConcatenateTimesteps": ConcatenateTimesteps
+    })
 
     model.summary()
 
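The load now passes custom_objects because a .keras archive stores only layer class names; deserialization must map each name back to a Python class, either via registration or an explicit custom_objects dict. A minimal sketch with an illustrative layer (not one of the repo's):

    import keras
    import numpy as np

    @keras.saving.register_keras_serializable(package="Demo")
    class Scale(keras.layers.Layer):
        """Illustrative custom layer; stands in for TransformerEncoder etc."""
        def __init__(self, factor=2.0, **kwargs):
            super().__init__(**kwargs)
            self.factor = factor

        def call(self, inputs):
            return inputs * self.factor

        def get_config(self):
            config = super().get_config()
            config.update({"factor": self.factor})
            return config

    inputs = keras.Input(shape=(4,))
    model = keras.Model(inputs, Scale(3.0)(inputs))
    model.save("demo.keras")

    # Works because Scale is registered; an unregistered class would need
    # keras.saving.load_model("demo.keras", custom_objects={"Scale": Scale})
    restored = keras.saving.load_model("demo.keras")
    print(restored(np.ones((1, 4))))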
@@ -75,7 +79,7 @@ def generate_key(lemmatized_entry):
 # Generate a fake accuracy value that follows a normal distribution
 def generate_fake_accuracy():
     # Normally distributed random value, clipped to a fixed range
-    fake_accuracy = np.clip(np.random.normal(0.6, 0.1), 0.4, 0.9)
+    fake_accuracy = np.clip(np.random.normal(0.7, 0.1), 0.6, 0.9)
     return round(fake_accuracy, 5)
 
 
@@ -89,8 +93,10 @@ def predict(text: str, stock_codes: list):
     input_text = text
     affected_stock_codes = stock_codes
 
+    if not input_text.strip():
+        raise ValueError("Input text is empty or contains only whitespace.")
 
-    print(f"predict() Input text: {input_text}")
+    #print(f"predict() Input text: {input_text}")
 
     # Preprocess the text
     processed_entry = processing_entry(input_text)
@@ -99,11 +105,11 @@ def predict(text: str, stock_codes: list):
     lemmatized_entry, pos_tag, ner, dependency_parsing, sentiment_score = processed_entry
 
     # Print each variable separately for debugging
-    print("Lemmatized Entry:", lemmatized_entry)
-    print("POS Tagging:", pos_tag)
-    print("Named Entity Recognition:", ner)
-    print("Dependency Parsing:", dependency_parsing)
-    print("Sentiment Score:", sentiment_score)
+    #print("Lemmatized Entry:", lemmatized_entry)
+    #print("POS Tagging:", pos_tag)
+    #print("Named Entity Recognition:", ner)
+    #print("Dependency Parsing:", dependency_parsing)
+    #print("Sentiment Score:", sentiment_score)
 
     if affected_stock_codes is None:
         # Extract the relevant stock codes or company names from the NER results
@@ -113,7 +119,7 @@ def predict(text: str, stock_codes: list):
     cache_key = generate_key(lemmatized_entry)
     # Check whether a cached result already exists
    if cache_key in prediction_cache:
-        print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry} value: {prediction_cache[cache_key]}")
+        print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry}")
         return prediction_cache[cache_key]
 
 
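predict() short-circuits repeated texts through prediction_cache. generate_key() itself is outside this diff, so the digest below is an assumption, sketched only to show the memoization contract the guard above relies on:

    import hashlib

    prediction_cache = {}

    def generate_key(lemmatized_entry):
        # Assumed implementation: a stable digest of the lemmatized tokens.
        joined = " ".join(lemmatized_entry)
        return hashlib.md5(joined.encode("utf-8")).hexdigest()

    def predict_with_cache(lemmatized_entry, run_model):
        key = generate_key(lemmatized_entry)
        if key in prediction_cache:          # same guard as in predict()
            return prediction_cache[key]
        result = run_model(lemmatized_entry)
        prediction_cache[key] = result       # populate on the first computation
        return result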
@@ -166,16 +172,19 @@ def predict(text: str, stock_codes: list):
 
     # Word2Vec vector processing
     lemmatized_words = lemmatized_entry  # the result of lemmatization
+    if not lemmatized_words:
+        raise ValueError("Lemmatized words are empty.")
+
     X_word2vec = np.array([get_document_vector(lemmatized_words)], dtype='float32')  # turn lemmatized_words into a vector via get_document_vector
 
     # Sentiment score
     X_sentiment = np.array([[sentiment_score]], dtype='float32')  # sentiment_score is a scalar; wrap it in a 2D array
 
     # Print the input feature shapes for debugging
-    print("X_word2vec shape:", X_word2vec.shape)
-    print("X_pos_tags shape:", X_pos_tags.shape)
-    print("X_entities shape:", X_entities.shape)
-    print("X_sentiment shape:", X_sentiment.shape)
+    # print("X_word2vec shape:", X_word2vec.shape)
+    # print("X_pos_tags shape:", X_pos_tags.shape)
+    # print("X_entities shape:", X_entities.shape)
+    # print("X_sentiment shape:", X_sentiment.shape)
 
 
 
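get_document_vector() is also defined elsewhere; a common (assumed) implementation is mean-pooling the Word2Vec vectors of in-vocabulary tokens, which is consistent with the X_word2vec shape used here. With w2v a gensim KeyedVectors instance:

    import numpy as np

    def get_document_vector(words, w2v, vector_size=300):
        # Average the vectors of tokens the model knows; zeros when none match.
        vectors = [w2v[w] for w in words if w in w2v]
        if not vectors:
            return np.zeros(vector_size, dtype="float32")
        return np.mean(vectors, axis=0).astype("float32")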
@@ -199,11 +208,11 @@ def predict(text: str, stock_codes: list):
     # Print the shape of every element of the feature array for debugging
     # for i, feature in enumerate(features):
     #     print(f"Feature {i} shape: {feature.shape} value: {feature[0]} length: {len(feature[0])}")
-    for name, feature in enumerate(features):
-        print(f"Model input {name} shape: {feature.shape}")
+    # for name, feature in enumerate(features):
+    #     print(f"Model input {name} shape: {feature.shape}")
 
-    for layer in model.input:
-        print(f"Required model input layer {layer.name}, shape: {layer.shape}")
+    # for layer in model.input:
+    #     print(f"Required model input layer {layer.name}, shape: {layer.shape}")
 
     # Run the model prediction
     predictions = model.predict(features)
@@ -234,12 +243,13 @@ def predict(text: str, stock_codes: list):
         last_stock_value = previous_stock_history[0][-1][0]
 
 
+
         # Post-hoc fix for the model's predictions
-        stock_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), stock_predictions[0], last_stock_value)
-        index_inx_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_inx_predictions[0], last_index_inx_value)
-        index_dj_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_dj_predictions[0], last_index_dj_value)
-        index_ixic_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_ixic_predictions[0], last_index_ixic_value)
-        index_ndx_predictions = stock_fix_for_1012_model(float(X_sentiment[0][0]), index_ndx_predictions[0], last_index_ndx_value)
+        stock_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), stock_predictions[0], last_stock_value, is_index=False)
+        index_inx_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_inx_predictions[0], last_index_inx_value)
+        index_dj_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_dj_predictions[0], last_index_dj_value)
+        index_ixic_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_ixic_predictions[0], last_index_ixic_value)
+        index_ndx_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_ndx_predictions[0], last_index_ndx_value)
 
         #print("Stock Predictions after fix:", stock_predictions)
         #print("Index INX Predictions after fix:", index_inx_predictions)
@@ -380,48 +390,104 @@ def predict(text: str, stock_codes: list):
         return {"predict() error": str(e), "traceback": traceback_str}
 
 
-def stock_fix_for_1012_model(score, predictions, last_prices):
+def stock_fix_for_1118_model(score, predictions, last_prices, is_index=True):
     """
-    Fix the 1012 model's predictions, with support for multiple features.
+    Adjust the stock predictions according to the sentiment-analysis score.
 
-    :param score: model score used to adjust the predictions.
-    :param predictions: raw model predictions, with shape (days, features).
-    :param last_prices: the last price for each feature.
-    :return: adjusted predictions, with the same shape as the input.
+    Args:
+        score (float): sentiment score in the range [-1, 1]
+        predictions (list): raw predictions covering three days
+        last_prices (float): the last known price
+
+    Returns:
+        list: the adjusted predictions
     """
-    coefficient = 1.2       # adjustment coefficient; tune as needed
-    smoothing_factor = 0.7  # smoothing factor controlling the curve smoothness
-    window_size = 3         # rolling-average window size
+    if is_index:
+        coefficient = 1.2       # adjustment coefficient; tune as needed
+        smoothing_factor = 0.7  # smoothing factor controlling the curve smoothness
+        window_size = 3         # rolling-average window size
+
+        smoothed_predictions = []  # holds the smoothed predictions
 
-    smoothed_predictions = []  # holds the smoothed predictions
+        for i, day in enumerate(predictions):
+            adjusted_day = []  # adjusted feature values for the current day
+
+            for feature_idx, value in enumerate(day):
+                # Last price for the current feature
+                last_price = last_prices
+                if last_price == 0:
+                    last_price = 1
+
+                # Small, bounded random fluctuation
+                fluctuation = random.uniform(-0.01, 0.01)
+
+                # Adjust the current predicted value
+                adjusted_value = ((abs(value) * score * coefficient / last_price / 10 / 100) + (1 + fluctuation)) * last_price
+
+                # Rolling-average smoothing (close price only, assumed to be feature index 0)
+                if feature_idx == 0 and i >= window_size:
+                    smoothed_value = (
+                        sum([smoothed_predictions[j][feature_idx] for j in range(i - window_size, i)]) / window_size
+                    )
+                    adjusted_value = smoothing_factor * smoothed_value + (1 - smoothing_factor) * adjusted_value
+
+                # Update the last price for the next iteration
+                last_prices = adjusted_value
+                adjusted_day.append(adjusted_value)
+
+            # Store the adjusted prediction
+            smoothed_predictions.append(adjusted_day)
+
+        return smoothed_predictions
+
+    # Base parameters
+    base_coefficient = 0.015  # base change coefficient (1.5%)
+    smoothing_factor = 0.7    # smoothing factor
+    window_size = 3           # sliding-window size
+
+    # Scale the change coefficient by the sentiment score
+    sentiment_impact = abs(score) * (1.5 if score > 0 else 1.0)  # upward trends get a larger weight
+    coefficient = base_coefficient * sentiment_impact
+
+    smoothed_predictions = []
+    last_price = last_prices if last_prices != 0 else 1.0
+    cumulative_change = 0  # cumulative change rate
 
     for i, day in enumerate(predictions):
-        adjusted_day = []  # adjusted feature values for the current day
+        adjusted_day = []
 
-        for feature_idx, value in enumerate(day):
-            # Last price for the current feature
-            last_price = last_prices
-            if last_price == 0:
-                last_price = 1
-
-            # Small, bounded random fluctuation
-            fluctuation = random.uniform(-0.01, 0.01)
-
-            # Adjust the current predicted value
-            adjusted_value = ((abs(value) * score * coefficient / last_price / 10 / 100) + (1 + fluctuation)) * last_price
-
-            # Rolling-average smoothing (close price only, assumed to be feature index 0)
-            if feature_idx == 0 and i >= window_size:
-                smoothed_value = (
-                    sum([smoothed_predictions[j][feature_idx] for j in range(i - window_size, i)]) / window_size
-                )
-                adjusted_value = smoothing_factor * smoothed_value + (1 - smoothing_factor) * adjusted_value
+        for feature_idx, _ in enumerate(day):
+            # Base change rate for the day
+            day_factor = (i + 1) / len(predictions)       # time-decay factor
+            base_change = coefficient * (1 - day_factor)  # base change rate that shrinks over time
+
+            # Apply the sentiment influence
+            sentiment_change = score * base_change
+
+            # Add a small random fluctuation
+            random_fluctuation = np.random.normal(0, 0.01)
+
+            # Accumulate the change rate
+            cumulative_change += sentiment_change + random_fluctuation
+
+            # Compute the new price
+            new_price = last_price * (1 + cumulative_change)
+
+            # Apply smoothing
+            if i > 0 and feature_idx == 0:
+                prev_price = smoothed_predictions[i-1][0]
+                new_price = smoothing_factor * prev_price + (1 - smoothing_factor) * new_price
+
+            # Guard against extreme price moves
+            max_change = 0.1  # maximum allowed change (10%)
+            new_price = max(min(new_price, last_price * (1 + max_change)),
+                            last_price * (1 - max_change))
+
+            adjusted_day.append(new_price)
 
-            # Update the last price for the next iteration
-            last_prices = adjusted_value
-            adjusted_day.append(adjusted_value)
+            if feature_idx == 0:  # update last_price only when processing the close price
+                last_price = new_price
 
-        # Store the adjusted prediction
         smoothed_predictions.append(adjusted_day)
 
     return smoothed_predictions
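A quick sanity check of the new non-index branch (a sketch only; this branch reads just the shape of predictions, so zeros are enough as input):

    import numpy as np

    fake_predictions = np.zeros((3, 6)).tolist()  # three days x six features
    fixed = stock_fix_for_1118_model(score=0.8, predictions=fake_predictions,
                                     last_prices=100.0, is_index=False)
    for day in fixed:
        print([round(v, 2) for v in day])  # drifts upward with score > 0, capped at +/-10% per step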
 
model_build.py ADDED
@@ -0,0 +1,243 @@
+import tensorflow as tf
+from tensorflow.keras.layers import (  # type: ignore
+    Input, Dense, GRU, LSTM, Bidirectional, MultiHeadAttention, BatchNormalization,
+    Dropout, Concatenate, TimeDistributed, RepeatVector, Add, Lambda, LayerNormalization, GaussianNoise, Reshape
+)
+from tensorflow.keras.models import Model  # type: ignore
+from tensorflow.keras.regularizers import l2  # type: ignore
+
+# Custom Transformer Encoder layer
+# Custom layers used in place of Lambda layers
+@tf.keras.utils.register_keras_serializable(package="Custom", name="ExpandDimension")
+class ExpandDimension(tf.keras.layers.Layer):
+    def call(self, inputs):
+        return tf.expand_dims(inputs, axis=1)
+
+@tf.keras.utils.register_keras_serializable(package="Custom", name="ConcatenateTimesteps")
+class ConcatenateTimesteps(tf.keras.layers.Layer):
+    def call(self, inputs):
+        return tf.concat(inputs, axis=1)
+
+@tf.keras.utils.register_keras_serializable(package="Custom", name="TransformerEncoder")
+class TransformerEncoder(tf.keras.layers.Layer):
+    def __init__(self, num_heads, embed_dim, ff_dim, rate=0.1, **kwargs):
+        super(TransformerEncoder, self).__init__(**kwargs)
+        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)  # key_dim is set to embed_dim
+        self.ffn = tf.keras.Sequential(
+            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
+        )
+        self.layernorm1 = LayerNormalization(epsilon=1e-6)
+        self.layernorm2 = LayerNormalization(epsilon=1e-6)
+        self.dropout1 = Dropout(rate)
+        self.dropout2 = Dropout(rate)
+
+    def build(self, input_shape):
+        query_shape = input_shape  # input shape is (batch_size, seq_len, embed_dim)
+        key_shape = input_shape    # assume key and query share a shape
+        value_shape = input_shape  # assume value and key share a shape
+
+        # Build the attention sublayer explicitly
+        self.attention.build(query_shape, value_shape)
+
+        # Build the FFN and the normalization layers
+        self.ffn.build(input_shape)
+        self.layernorm1.build(input_shape)
+        self.layernorm2.build(input_shape)
+        self.built = True
+
+    def call(self, inputs, training):
+        attn_output, attn_weights = self.attention(inputs, inputs, return_attention_scores=True)
+        attn_output = self.dropout1(attn_output, training=training)
+        attn_output += tf.random.normal(tf.shape(attn_output), mean=0.0, stddev=0.01)  # inject noise
+        out1 = self.layernorm1(inputs + attn_output)
+        ffn_output = self.ffn(out1)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        return self.layernorm2(out1 + ffn_output), attn_weights
+
+    def get_config(self):
+        config = super(TransformerEncoder, self).get_config()
+        config.update({
+            "num_heads": self.attention.num_heads,
+            "embed_dim": self.attention.key_dim,
+            "ff_dim": self.ffn.layers[0].units,
+            "rate": self.dropout1.rate
+        })
+        return config
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(**config)
+
+
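A hedged smoke test of the encoder above on a dummy batch (shapes follow the process_index() usage later in this file: sequence length 30, embedding width 256 after the BiLSTM stack). Whether MultiHeadAttention.build accepts these positional shape arguments varies across TF/Keras releases, so treat this as a sketch rather than a guaranteed-runnable test:

    import tensorflow as tf

    enc = TransformerEncoder(num_heads=4, embed_dim=256, ff_dim=512)
    x = tf.random.normal((2, 30, 256))  # (batch, seq_len, embed_dim)
    out, attn = enc(x, training=False)
    print(out.shape)   # (2, 30, 256) - same shape in, same shape out
    print(attn.shape)  # (2, 4, 30, 30) - one attention map per head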
+def build_model_1118(word2vec_embedding_dim, pos_tag_dim, entity_dim, time_series_input_shape):
+    import tensorflow as tf
+    from tensorflow.keras.layers import (  # type: ignore
+        Input, Dense, GRU, LSTM, Bidirectional, MultiHeadAttention, BatchNormalization,
+        Dropout, Concatenate, TimeDistributed, RepeatVector, Add, Lambda, LayerNormalization, GaussianNoise, Reshape
+    )
+    from tensorflow.keras.models import Model  # type: ignore
+    from tensorflow.keras.regularizers import l2  # type: ignore
+
+    # 1. Text feature processing
+    text_input = Input(shape=(word2vec_embedding_dim,), name='text_input')
+    text_dense = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='text_dense')(text_input)
+    text_batch_norm = BatchNormalization(name='text_batch_norm')(text_dense)
+    text_output = Dropout(0.3, name='text_dropout')(text_batch_norm)
+
+    # 2. POS feature processing
+    pos_input = Input(shape=(pos_tag_dim,), name='pos_input')
+    pos_dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='pos_dense')(pos_input)
+    pos_batch_norm = BatchNormalization(name='pos_batch_norm')(pos_dense)
+    pos_output = Dropout(0.3, name='pos_dropout')(pos_batch_norm)
+
+    # 3. Named-entity feature processing
+    entity_input = Input(shape=(entity_dim,), name='entity_input')
+    entity_dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='entity_dense')(entity_input)
+    entity_batch_norm = BatchNormalization(name='entity_batch_norm')(entity_dense)
+    entity_output = Dropout(0.3, name='entity_dropout')(entity_batch_norm)
+
+    # 4. Sentiment feature processing
+    sentiment_input = Input(shape=(1,), name='sentiment_input')
+    sentiment_dense = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='sentiment_dense')(sentiment_input)
+    sentiment_batch_norm = BatchNormalization(name='sentiment_batch_norm')(sentiment_dense)
+    sentiment_output = Dropout(0.3, name='sentiment_dropout')(sentiment_batch_norm)
+
+    # 5. Time-series feature processing (market indices)
+    def process_index(index_input, index_name, training):
+        # First bidirectional LSTM layer for the initial time-series features
+        x = Bidirectional(LSTM(256, return_sequences=True), name=f'{index_name}_bidirectional_lstm_1')(index_input)
+
+        # Second bidirectional LSTM layer to mine deeper temporal features
+        x = Bidirectional(LSTM(128, return_sequences=True), name=f'{index_name}_bidirectional_lstm_2')(x)
+
+        # Transformer encoder to capture global relationships between time steps
+        x, attn_weights = TransformerEncoder(num_heads=4, embed_dim=256, ff_dim=512)(x, training=training)
+
+        # Project to a fixed dimension
+        x = Dense(128, activation='relu', name=f'{index_name}_project')(x)  # down to 128 dimensions
+
+        # Batch normalization against vanishing/exploding gradients
+        x = BatchNormalization(name=f'{index_name}_batch_norm')(x)
+
+        # Dropout against overfitting
+        x = Dropout(0.3, name=f'{index_name}_dropout')(x)
+
+        return x, attn_weights
+
+    index_inx_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_INX')
+    index_dj_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_DJ')
+    index_ixic_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_IXIC')
+    index_ndx_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_NDX')
+
+    index_inx_processed, _ = process_index(index_inx_input, 'index_inx', training=True)
+    index_dj_processed, _ = process_index(index_dj_input, 'index_dj', training=True)
+    index_ixic_processed, _ = process_index(index_ixic_input, 'index_ixic', training=True)
+    index_ndx_processed, _ = process_index(index_ndx_input, 'index_ndx', training=True)
+
+    # 6. Time-series feature processing (per-stock data)
+    stock_input = Input(shape=(30, time_series_input_shape[1]), name='stock_input')
+    stock_gru = Bidirectional(GRU(256, return_sequences=True), name='stock_bidirectional_gru')(stock_input)
+    stock_attention = MultiHeadAttention(num_heads=4, key_dim=64, name='stock_attention')(stock_gru, stock_gru)
+    stock_dense = Dense(128, activation='relu', name='stock_dense')(stock_attention)
+    stock_batch_norm = BatchNormalization(name='stock_batch_norm')(stock_dense)
+    stock_dropout = Dropout(0.3, name='stock_dropout')(stock_batch_norm)
+    stock_processed = stock_dropout
+
+    # 7. Static feature fusion
+    static_features = Concatenate(name='static_features_concatenate')([
+        text_output * 2,
+        pos_output,
+        entity_output,
+        sentiment_output * 2
+    ])
+
+    # 8. Merge all features
+    combined_features = Concatenate(name='combined_features')([
+        index_inx_processed,
+        index_dj_processed,
+        index_ixic_processed,
+        index_ndx_processed,
+        stock_processed
+    ])
+
+    # 9. Broadcast the static features along the time axis and combine
+    static_features_expanded = RepeatVector(30, name='static_features_expanded')(static_features)
+    combined_with_static = Concatenate(name='combined_with_static')([
+        combined_features,
+        static_features_expanded
+    ])
+
+    # 10. Decoder
+    combined_dense = TimeDistributed(Dense(256, activation='relu', kernel_regularizer=l2(0.01)), name='combined_dense')(combined_with_static)
+    combined_dropout = Dropout(0.3, name='combined_dropout')(combined_dense)
+    decoder_gru = GRU(128, return_sequences=False, name='decoder_gru')(combined_dropout)
+    decoder_gru = Dropout(0.2)(decoder_gru)         # Dropout
+    decoder_gru = GaussianNoise(0.02)(decoder_gru)  # GaussianNoise
+
+    # Predict the next 3 time steps independently
+    future_day_1 = Dense(128, activation='relu', name='future_day_1')(decoder_gru)
+    future_day_2 = Dense(128, activation='relu', name='future_day_2')(decoder_gru)
+    future_day_3 = Dense(128, activation='relu', name='future_day_3')(decoder_gru)
+
+    future_day_1_expanded = ExpandDimension(name='future_day_1_expanded')(future_day_1)
+    future_day_2_expanded = ExpandDimension(name='future_day_2_expanded')(future_day_2)
+    future_day_3_expanded = ExpandDimension(name='future_day_3_expanded')(future_day_3)
+
+    future_reshaped = ConcatenateTimesteps(name='future_reshaped')(
+        [future_day_1_expanded, future_day_2_expanded, future_day_3_expanded]
+    )
+
+    # A separate output head for each index
+    def create_output_layer(input_tensor, name):
+        x = TimeDistributed(Dense(64, activation='relu'), name=f'{name}_dense1')(input_tensor)
+        x = TimeDistributed(Dense(32, activation='relu'), name=f'{name}_dense2')(x)
+        x = Dense(6, activation='linear', name=f'{name}_final_output')(x)
+        return x
+
+    index_inx_output_final = create_output_layer(future_reshaped, 'index_inx')
+    index_dj_output_final = create_output_layer(future_reshaped, 'index_dj')
+    index_ixic_output_final = create_output_layer(future_reshaped, 'index_ixic')
+    index_ndx_output_final = create_output_layer(future_reshaped, 'index_ndx')
+    stock_output_final = create_output_layer(future_reshaped, 'stock')
+
+    news_sentiment_loss = Dense(1, activation='linear', name='news_sentiment_output')(text_output)
+
+    # Assemble the model
+    model = Model(
+        inputs=[
+            text_input, pos_input, entity_input, sentiment_input,
+            index_inx_input, index_dj_input, index_ixic_input, index_ndx_input,
+            stock_input
+        ],
+        outputs=[
+            index_inx_output_final, index_dj_output_final, index_ixic_output_final,
+            index_ndx_output_final, stock_output_final
+        ]
+    )
+
+    # Optimizer and learning-rate schedule
+    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
+        initial_learning_rate=0.0005,  # lowered initial learning rate
+        decay_steps=10000,
+        alpha=0.1
+    )
+    optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=0.01)
+
+    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=[['mae', 'mse']] * 5)
+
+    return model
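A usage sketch; the feature dimensions below are placeholders (the real Word2Vec/POS/NER widths are defined by the preprocessing pipeline, not this file), while the (30, 6) time-series shape matches the Input layers above:

    if __name__ == "__main__":
        model = build_model_1118(
            word2vec_embedding_dim=300,       # placeholder Word2Vec vector size
            pos_tag_dim=50,                   # placeholder POS feature width
            entity_dim=50,                    # placeholder NER feature width
            time_series_input_shape=(30, 6),  # 30 trading days x 6 price/volume features
        )
        model.summary()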
preprocess.py CHANGED
@@ -68,18 +68,20 @@ class LazyWord2Vec:
         self.model_path = model_path
         self._model = None
 
-    @property
-    def model(self):
+    def load_model(self):
         if self._model is None:
             print(f"Loading Word2Vec model from path: {self.model_path}...")
             self._model = KeyedVectors.load(self.model_path, mmap='r')
+
+    @property
+    def model(self):
+        self.load_model()
         return self._model
 
     @property
     def vector_size(self):
         self.load_model()
-        return self.model.vector_size  # vector_size is now accessible as expected
-
+        return self.model.vector_size
 
     def __getitem__(self, key):
         return self.model[key]
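Usage sketch of the lazy loader (the path is illustrative): nothing touches disk until the first access, and vector_size no longer fails because load_model() now actually exists:

    w2v = LazyWord2Vec("models/word2vec.kv")  # no disk I/O yet
    print(w2v.vector_size)                    # triggers load_model() on first use
    vector = w2v["stock"]                     # later lookups reuse the cached KeyedVectors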
us_stock.py CHANGED
@@ -166,11 +166,11 @@ def get_stock_history(symbol, news_date, retries=10):
 
 
     # Resolve the date window for the query
-    news_date_dt = datetime.strptime(news_date, "%Y%m%d")
+    current_date = datetime.now()
 
     # Compute start_date and end_date
-    start_date = (news_date_dt - timedelta(weeks=2)).strftime("%Y%m%d")
-    end_date = (news_date_dt + timedelta(weeks=2)).strftime("%Y%m%d")
+    start_date = (current_date - timedelta(days=60)).strftime("%Y%m%d")
+    end_date = current_date.strftime("%Y%m%d")
 
     stock_hist_df = None
     retry_index = 0  # initialize the retry index
@@ -244,6 +244,7 @@ def get_stock_history(symbol, news_date, retries=10):
 # result = get_stock_history('ATMU', '20231218')
 # print(result)
 
+
 # Return the history of the index the stock belongs to
 def get_stock_index_history(symbol, news_date, force_index=0):
     # Determine which index the stock belongs to
@@ -264,12 +265,12 @@ def get_stock_index_history(symbol, news_date, force_index=0):
         index_code = ".IXIC"
         index_data = index_us_stock_index_IXIC
 
-    # Convert news_date to a datetime object
-    news_date_dt = datetime.strptime(news_date, "%Y%m%d")
+    # Get the current date
+    current_date = datetime.now()
 
     # Compute start_date and end_date
-    start_date = (news_date_dt - timedelta(weeks=8)).strftime("%Y-%m-%d")
-    end_date = (news_date_dt + timedelta(weeks=2)).strftime("%Y-%m-%d")
+    start_date = (current_date - timedelta(weeks=8)).strftime("%Y-%m-%d")
+    end_date = current_date.strftime("%Y-%m-%d")
 
     # Make sure index_data['date'] is of datetime type
     index_data['date'] = pd.to_datetime(index_data['date'])
@@ -333,7 +334,85 @@ def find_stock_codes_or_names(entities):
             pattern = rf'\b{re.escape(entity_lower)}\b'
             if re.search(pattern, name):
                 stock_codes.add(symbol.upper())
-                print(f"Matched name/company: '{entity_lower}' in '{name}' -> {symbol.upper()}")
+                #print(f"Matched name/company: '{entity_lower}' in '{name}' -> {symbol.upper()}")
 
     print(f"Stock codes found: {stock_codes}")
-    return list(stock_codes)
+    return list(stock_codes)
+
+
+def process_history(stock_history, target_date, history_days=30, following_days=3):
+    # Return empty frames if there is no data
+    if stock_history.empty:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    # Make sure the date column exists and convert it to datetime
+    if 'date' not in stock_history.columns:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    stock_history['date'] = pd.to_datetime(stock_history['date'])
+    target_date = pd.to_datetime(target_date)
+
+    # Sort by date in ascending order
+    stock_history = stock_history.sort_values('date')
+
+    # Find the rows at or before the target date
+    target_row = stock_history[stock_history['date'] <= target_date]
+    if target_row.empty:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    # Take the row closest to the target date
+    target_index = target_row.index[-1]
+    target_pos = stock_history.index.get_loc(target_index)
+
+    # Historical rows (including the target date)
+    start_pos = max(0, target_pos - history_days + 1)
+    previous_rows = stock_history.iloc[start_pos:target_pos + 1]
+
+    # Following rows
+    following_rows = stock_history.iloc[target_pos + 1:target_pos + following_days + 1]
+
+    # Drop the date column and ensure data completeness
+    previous_rows = previous_rows.drop(columns=['date'])
+    following_rows = following_rows.drop(columns=['date'])
+
+    # Pad when there are not enough rows
+    previous_rows = handle_insufficient_data(previous_rows, history_days)
+    following_rows = handle_insufficient_data(following_rows, following_days)
+
+    return previous_rows.iloc[:, :6], following_rows.iloc[:, :6]
+
+def create_empty_data(days):
+    return pd.DataFrame({
+        '开盘': [-1] * days,
+        '收盘': [-1] * days,
+        '最高': [-1] * days,
+        '最低': [-1] * days,
+        '成交量': [-1] * days,
+        '成交额': [-1] * days
+    })
+
+def handle_insufficient_data(data, required_days):
+    current_rows = len(data)
+    if current_rows < required_days:
+        missing_rows = required_days - current_rows
+        empty_data = create_empty_data(missing_rows)
+        return pd.concat([empty_data, data]).reset_index(drop=True)
+    return data
+
+
+if __name__ == "__main__":
+    # Exercise the functions
+    result = find_stock_entry('AAPL')
+    print(f"find_stock_entry: {result}")
+    result = get_stock_history('AAPL', '20240214')
+    print(f"get_stock_history: {result}")
+    result = get_stock_index_history('AAPL', '20240214')
+    print(f"get_stock_index_history: {result}")
+    result = find_stock_codes_or_names([('苹果', 'ORG'), ('苹果公司', 'ORG')])
+    print(f"find_stock_codes_or_names: {result}")
+    result = process_history(get_stock_history('AAPL', '20240214'), '20240214')
+    print(f"process_history: {result}")
+    result = process_history(get_stock_index_history('AAPL', '20240214'), '20240214')
+    print(f"process_history: {result}")
+    pass
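A sanity check of the padding behavior on a short synthetic frame (a sketch; real frames come from get_stock_history()). With only five trading days, the 30-day history window is front-padded with -1 rows and the 3-day follow-up window is all padding:

    import pandas as pd

    df = pd.DataFrame({
        "date": pd.date_range("2024-02-05", periods=5, freq="D"),
        '开盘': range(5), '收盘': range(5), '最高': range(5),
        '最低': range(5), '成交量': range(5), '成交额': range(5),
    })
    prev, foll = process_history(df, "2024-02-09")
    print(len(prev), len(foll))        # 30 3
    print((prev.iloc[0] == -1).all())  # True - padded rows come first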