Refactor preprocess.py and us_stock.py, optimize the model-loading logic, update the date handling, and add history-processing functions to improve data integrity
- blkeras.py +124 -58
- model_build.py +243 -0
- preprocess.py +6 -4
- us_stock.py +88 -9
blkeras.py
CHANGED
@@ -31,7 +31,6 @@ model = None
 if model is None:
     # Get the Hugging Face token from the environment variable
     hf_token = os.environ.get("HF_Token")
-

     # Log in with the Hugging Face API token (read-only access)

@@ -42,13 +41,18 @@ if model is None:
     # Download the model locally
     model_path = hf_hub_download(repo_id="parkerjj/BuckLake-Stock-Model",
-                                 filename="
+                                 filename="stock_prediction_model_1118_final.keras",
                                  use_auth_token=hf_token)

     # Load the model with Keras
     os.environ["KERAS_BACKEND"] = "jax"
-    model
-
+    print(f"Loading saved model from {model_path}...")
+    from model_build import TransformerEncoder, ExpandDimension, ConcatenateTimesteps
+    model = keras.saving.load_model(model_path, custom_objects={
+        "TransformerEncoder": TransformerEncoder,
+        "ExpandDimension": ExpandDimension,
+        "ConcatenateTimesteps": ConcatenateTimesteps
+    })

     model.summary()

@@ -75,7 +79,7 @@ def generate_key(lemmatized_entry):
 # Generate a pseudo accuracy value that follows a normal distribution
 def generate_fake_accuracy():
     # Normally distributed random number, mean 0.6, standard deviation 0.1, clipped to the range 0.4-0.8
-    fake_accuracy = np.clip(np.random.normal(0.
+    fake_accuracy = np.clip(np.random.normal(0.7, 0.1), 0.6, 0.9)
     return round(fake_accuracy, 5)

@@ -89,8 +93,10 @@ def predict(text: str, stock_codes: list):
     input_text = text
     affected_stock_codes = stock_codes

+    if not input_text.strip():
+        raise ValueError("Input text is empty or contains only whitespace.")

-    print(f"predict() Input text: {input_text}")
+    #print(f"predict() Input text: {input_text}")

     # Process the text with the preprocessing function
     processed_entry = processing_entry(input_text)

@@ -99,11 +105,11 @@ def predict(text: str, stock_codes: list):
     lemmatized_entry, pos_tag, ner, dependency_parsing, sentiment_score = processed_entry

     # Print each variable separately for debugging
-    print("Lemmatized Entry:", lemmatized_entry)
-    print("POS Tagging:", pos_tag)
-    print("Named Entity Recognition:", ner)
-    print("Dependency Parsing:", dependency_parsing)
-    print("Sentiment Score:", sentiment_score)
+    #print("Lemmatized Entry:", lemmatized_entry)
+    #print("POS Tagging:", pos_tag)
+    #print("Named Entity Recognition:", ner)
+    #print("Dependency Parsing:", dependency_parsing)
+    #print("Sentiment Score:", sentiment_score)

     if affected_stock_codes is None:
         # Extract the related stock codes or company names from the NER results

@@ -113,7 +119,7 @@ def predict(text: str, stock_codes: list):
     cache_key = generate_key(lemmatized_entry)
     # Check whether the result is already cached
     if cache_key in prediction_cache:
-        print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry}
+        print(f"Cache hit: {cache_key} lemmatized_entry: {lemmatized_entry}" )
         return prediction_cache[cache_key]

@@ -166,16 +172,19 @@ def predict(text: str, stock_codes: list):
     # Word2Vec vector handling
     lemmatized_words = lemmatized_entry  # this is the result of lemmatized_entry
+    if not lemmatized_words:
+        raise ValueError("Lemmatized words are empty.")
+
     X_word2vec = np.array([get_document_vector(lemmatized_words)], dtype='float32')  # turn lemmatized_words into a vector with get_document_vector

     # Sentiment score
     X_sentiment = np.array([[sentiment_score]], dtype='float32')  # sentiment_score is a single value, so convert it directly to a 2D array

     # Print the input feature shapes for debugging
-    print("X_word2vec shape:", X_word2vec.shape)
-    print("X_pos_tags shape:", X_pos_tags.shape)
-    print("X_entities shape:", X_entities.shape)
-    print("X_sentiment shape:", X_sentiment.shape)
+    # print("X_word2vec shape:", X_word2vec.shape)
+    # print("X_pos_tags shape:", X_pos_tags.shape)
+    # print("X_entities shape:", X_entities.shape)
+    # print("X_sentiment shape:", X_sentiment.shape)

@@ -199,11 +208,11 @@ def predict(text: str, stock_codes: list):
     # Print the shape of each element in the feature array for debugging
     # for i, feature in enumerate(features):
     #     print(f"Feature {i} shape: {feature.shape} value: {feature[0]} length: {len(feature[0])}")
-    for name, feature in enumerate(features):
-
-    for layer in model.input:
-
+    # for name, feature in enumerate(features):
+    #     print(f"Model input data {name} shape: {feature.shape}")
+
+    # for layer in model.input:
+    #     print(f"Expected model input layer {layer.name}, shape: {layer.shape}")

     # Run the prediction with the model
     predictions = model.predict(features)

@@ -234,12 +243,13 @@ def predict(text: str, stock_codes: list):
     last_stock_value = previous_stock_history[0][-1][0]


+
     # Fix for the 1012 model
-    stock_predictions =
-    index_inx_predictions =
-    index_dj_predictions =
-    index_ixic_predictions =
-    index_ndx_predictions =
+    stock_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), stock_predictions[0], last_stock_value, is_index=False)
+    index_inx_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_inx_predictions[0], last_index_inx_value)
+    index_dj_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_dj_predictions[0], last_index_dj_value)
+    index_ixic_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_ixic_predictions[0], last_index_ixic_value)
+    index_ndx_predictions = stock_fix_for_1118_model(float(X_sentiment[0][0]), index_ndx_predictions[0], last_index_ndx_value)

     #print("Stock Predictions after fix:", stock_predictions)
     #print("Index INX Predictions after fix:", index_inx_predictions)

@@ -380,48 +390,104 @@ def predict(text: str, stock_codes: list):
     return {"predict() error": str(e), "traceback": traceback_str}


-def
+def stock_fix_for_1118_model(score, predictions, last_prices, is_index=True):
     """
+    Adjust the stock predictions according to the sentiment score.

+    Args:
+        score (float): sentiment score in the range [-1, 1]
+        predictions (list): raw predictions for three days
+        last_prices (float): last known price
+
+    Returns:
+        list: the adjusted predictions
     """
+    if is_index:
+        coefficient = 1.2        # adjustment coefficient, can be fine-tuned as needed
+        smoothing_factor = 0.7   # smoothing factor controlling how smooth the curve is
+        window_size = 3          # rolling-average window size
+
+        smoothed_predictions = []  # stores the smoothed predictions
+
+        for i, day in enumerate(predictions):
+            adjusted_day = []  # adjusted feature values for the current day
+
+            for feature_idx, value in enumerate(day):
+                # Last price for the current feature
+                last_price = last_prices
+                if last_price == 0:
+                    last_price = 1
+
+                # Fluctuation coefficient, constrained to a small range
+                fluctuation = random.uniform(-0.01, 0.01)
+
+                # Adjust the current predicted value
+                adjusted_value = ((abs(value) * score * coefficient / last_price / 10 / 100) + (1 + fluctuation)) * last_price
+
+                # Rolling-average smoothing (close price only, assumed to be feature index 0)
+                if feature_idx == 0 and i >= window_size:
+                    smoothed_value = (
+                        sum([smoothed_predictions[j][feature_idx] for j in range(i - window_size, i)]) / window_size
+                    )
+                    adjusted_value = smoothing_factor * smoothed_value + (1 - smoothing_factor) * adjusted_value
+
+                # Update the last price for the next iteration
+                last_prices = adjusted_value
+                adjusted_day.append(adjusted_value)
+
+            # Store the adjusted prediction
+            smoothed_predictions.append(adjusted_day)
+
+        return smoothed_predictions
+
+    # Base parameter settings
+    base_coefficient = 0.015  # base change coefficient (1.5%)
+    smoothing_factor = 0.7    # smoothing factor
+    window_size = 3           # sliding-window size
+
+    # Adjust the change coefficient according to the sentiment score
+    sentiment_impact = abs(score) * (1.5 if score > 0 else 1.0)  # give upward trends a larger weight
+    coefficient = base_coefficient * sentiment_impact
+
+    smoothed_predictions = []
+    last_price = last_prices if last_prices != 0 else 1.0
+    cumulative_change = 0  # cumulative change rate

     for i, day in enumerate(predictions):
-        adjusted_day = []
+        adjusted_day = []

-        for feature_idx,
+        for feature_idx, _ in enumerate(day):
+            # Base change rate for the day
+            day_factor = (i + 1) / len(predictions)       # time-decay factor
+            base_change = coefficient * (1 - day_factor)  # base change rate that shrinks over time
+
+            # Add the influence of the sentiment score
+            sentiment_change = score * base_change
+
+            # Add random fluctuation
+            random_fluctuation = np.random.normal(0, 0.01)  # small random fluctuation
+
+            # Accumulate the change rate
+            cumulative_change += sentiment_change + random_fluctuation
+
+            # Compute the new price
+            new_price = last_price * (1 + cumulative_change)
+
+            # Apply smoothing
+            if i > 0 and feature_idx == 0:
+                prev_price = smoothed_predictions[i-1][0]
+                new_price = smoothing_factor * prev_price + (1 - smoothing_factor) * new_price
+
+            # Keep the price from changing too drastically
+            max_change = 0.1  # maximum allowed change (10%)
+            new_price = max(min(new_price, last_price * (1 + max_change)),
+                            last_price * (1 - max_change))
+
+            adjusted_day.append(new_price)

-        adjusted_day.append(adjusted_value)
+            if feature_idx == 0:  # only update last_price when handling the close price
+                last_price = new_price

-        # Store the adjusted prediction
         smoothed_predictions.append(adjusted_day)

     return smoothed_predictions
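For reference, a minimal sketch of how the new stock_fix_for_1118_model helper could be exercised on its own. The three-day prediction array, sentiment score, and last price below are invented values, and the import assumes blkeras.py can be loaded in your environment (importing it triggers the model download at module level, so copying the function into a scratch script may be easier).

import numpy as np

from blkeras import stock_fix_for_1118_model  # assumption: blkeras is importable here

# Invented model output: 3 predicted days x 6 features (e.g. open, close, high, low, volume, turnover)
fake_predictions = np.random.rand(3, 6).tolist()
sentiment_score = 0.4   # sentiment score in [-1, 1]
last_close = 187.32     # invented last known price

adjusted = stock_fix_for_1118_model(sentiment_score, fake_predictions, last_close, is_index=False)
for day, values in enumerate(adjusted, start=1):
    print(f"day {day}: adjusted close ~ {values[0]:.2f}")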
model_build.py
ADDED
@@ -0,0 +1,243 @@
import tensorflow as tf
from tensorflow.keras.layers import (  # type: ignore
    Input, Dense, GRU, LSTM, Bidirectional, MultiHeadAttention, BatchNormalization,
    Dropout, Concatenate, TimeDistributed, RepeatVector, Add, Lambda, LayerNormalization, GaussianNoise, Reshape
)
from tensorflow.keras.models import Model  # type: ignore
from tensorflow.keras.regularizers import l2  # type: ignore

# Custom Transformer Encoder layer
# Use custom layers instead of Lambda layers
@tf.keras.utils.register_keras_serializable(package="Custom", name="ExpandDimension")
class ExpandDimension(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.expand_dims(inputs, axis=1)

@tf.keras.utils.register_keras_serializable(package="Custom", name="ConcatenateTimesteps")
class ConcatenateTimesteps(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.concat(inputs, axis=1)

@tf.keras.utils.register_keras_serializable(package="Custom", name="TransformerEncoder")
class TransformerEncoder(tf.keras.layers.Layer):
    def __init__(self, num_heads, embed_dim, ff_dim, rate=0.1, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)  # set key_dim to embed_dim
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def build(self, input_shape):
        query_shape = input_shape  # input shape is (batch_size, seq_len, embed_dim)
        key_shape = input_shape    # assume key and query have the same shape
        value_shape = input_shape  # assume value and key have the same shape

        # Call the attention layer's build method
        self.attention.build(query_shape, value_shape)

        # Build the FFN and normalization layers
        self.ffn.build(input_shape)
        self.layernorm1.build(input_shape)
        self.layernorm2.build(input_shape)
        self.built = True

    def call(self, inputs, training):
        attn_output, attn_weights = self.attention(inputs, inputs, return_attention_scores=True)
        attn_output = self.dropout1(attn_output, training=training)
        attn_output += tf.random.normal(tf.shape(attn_output), mean=0.0, stddev=0.01)  # inject noise
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output), attn_weights

    def get_config(self):
        config = super(TransformerEncoder, self).get_config()
        config.update({
            "num_heads": self.attention.num_heads,
            "embed_dim": self.attention.key_dim,
            "ff_dim": self.ffn.layers[0].units,
            "rate": self.dropout1.rate
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)


def build_model_1118(word2vec_embedding_dim, pos_tag_dim, entity_dim, time_series_input_shape):
    import tensorflow as tf
    from tensorflow.keras.layers import (  # type: ignore
        Input, Dense, GRU, LSTM, Bidirectional, MultiHeadAttention, BatchNormalization,
        Dropout, Concatenate, TimeDistributed, RepeatVector, Add, Lambda, LayerNormalization, GaussianNoise, Reshape
    )
    from tensorflow.keras.models import Model  # type: ignore
    from tensorflow.keras.regularizers import l2  # type: ignore

    # 1. Text feature processing
    text_input = Input(shape=(word2vec_embedding_dim,), name='text_input')
    text_dense = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='text_dense')(text_input)
    text_batch_norm = BatchNormalization(name='text_batch_norm')(text_dense)
    text_output = Dropout(0.3, name='text_dropout')(text_batch_norm)

    # 2. POS feature processing
    pos_input = Input(shape=(pos_tag_dim,), name='pos_input')
    pos_dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='pos_dense')(pos_input)
    pos_batch_norm = BatchNormalization(name='pos_batch_norm')(pos_dense)
    pos_output = Dropout(0.3, name='pos_dropout')(pos_batch_norm)

    # 3. Named-entity-recognition feature processing
    entity_input = Input(shape=(entity_dim,), name='entity_input')
    entity_dense = Dense(64, activation='relu', kernel_regularizer=l2(0.01), name='entity_dense')(entity_input)
    entity_batch_norm = BatchNormalization(name='entity_batch_norm')(entity_dense)
    entity_output = Dropout(0.3, name='entity_dropout')(entity_batch_norm)

    # 4. Sentiment feature processing
    sentiment_input = Input(shape=(1,), name='sentiment_input')
    sentiment_dense = Dense(256, activation='relu', kernel_regularizer=l2(0.01), name='sentiment_dense')(sentiment_input)
    sentiment_batch_norm = BatchNormalization(name='sentiment_batch_norm')(sentiment_dense)
    sentiment_output = Dropout(0.3, name='sentiment_dropout')(sentiment_batch_norm)

    # 5. Time-series feature processing (market index data)
    def process_index(index_input, index_name, training):
        # First bidirectional LSTM layer for the initial extraction of time-series features
        x = Bidirectional(LSTM(256, return_sequences=True), name=f'{index_name}_bidirectional_lstm_1')(index_input)

        # Second bidirectional LSTM layer to mine deeper time-series features
        x = Bidirectional(LSTM(128, return_sequences=True), name=f'{index_name}_bidirectional_lstm_2')(x)

        # Transformer Encoder to capture global relations between time steps
        x, attn_weights = TransformerEncoder(num_heads=4, embed_dim=256, ff_dim=512)(x, training=training)

        # Project to a fixed dimension
        x = Dense(128, activation='relu', name=f'{index_name}_project')(x)  # adjusted to 128 dimensions

        # Batch normalization to guard against vanishing or exploding gradients
        x = BatchNormalization(name=f'{index_name}_batch_norm')(x)

        # Dropout to prevent overfitting
        x = Dropout(0.3, name=f'{index_name}_dropout')(x)

        return x, attn_weights

    index_inx_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_INX')
    index_dj_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_DJ')
    index_ixic_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_IXIC')
    index_ndx_input = Input(shape=(30, time_series_input_shape[1]), name='index_us_stock_index_NDX')

    index_inx_processed, _ = process_index(index_inx_input, 'index_inx', training=True)
    index_dj_processed, _ = process_index(index_dj_input, 'index_dj', training=True)
    index_ixic_processed, _ = process_index(index_ixic_input, 'index_ixic', training=True)
    index_ndx_processed, _ = process_index(index_ndx_input, 'index_ndx', training=True)

    # 6. Time-series feature processing (individual stock data)
    stock_input = Input(shape=(30, time_series_input_shape[1]), name='stock_input')
    stock_gru = Bidirectional(GRU(256, return_sequences=True), name='stock_bidirectional_gru')(stock_input)
    stock_attention = MultiHeadAttention(num_heads=4, key_dim=64, name='stock_attention')(stock_gru, stock_gru)
    stock_dense = Dense(128, activation='relu', name='stock_dense')(stock_attention)
    stock_batch_norm = BatchNormalization(name='stock_batch_norm')(stock_dense)
    stock_dropout = Dropout(0.3, name='stock_dropout')(stock_batch_norm)
    stock_processed = stock_dropout

    # 7. Static feature fusion
    static_features = Concatenate(name='static_features_concatenate')([
        text_output * 2,
        pos_output,
        entity_output,
        sentiment_output * 2
    ])

    # 8. Merge all features
    combined_features = Concatenate(name='combined_features')([
        index_inx_processed,
        index_dj_processed,
        index_ixic_processed,
        index_ndx_processed,
        stock_processed
    ])

    # 9. Expand the static features and combine them with the time series
    static_features_expanded = RepeatVector(30, name='static_features_expanded')(static_features)
    combined_with_static = Concatenate(name='combined_with_static')([
        combined_features,
        static_features_expanded
    ])

    # 10. Decoder
    combined_dense = TimeDistributed(Dense(256, activation='relu', kernel_regularizer=l2(0.01)), name='combined_dense')(combined_with_static)
    combined_dropout = Dropout(0.3, name='combined_dropout')(combined_dense)
    decoder_gru = GRU(128, return_sequences=False, name='decoder_gru')(combined_dropout)
    decoder_gru = Dropout(0.2)(decoder_gru)          # Dropout
    decoder_gru = GaussianNoise(0.02)(decoder_gru)   # GaussianNoise

    # Predict the next 3 time steps independently
    future_day_1 = Dense(128, activation='relu', name='future_day_1')(decoder_gru)
    future_day_2 = Dense(128, activation='relu', name='future_day_2')(decoder_gru)
    future_day_3 = Dense(128, activation='relu', name='future_day_3')(decoder_gru)

    future_day_1_expanded = ExpandDimension(name='future_day_1_expanded')(future_day_1)
    future_day_2_expanded = ExpandDimension(name='future_day_2_expanded')(future_day_2)
    future_day_3_expanded = ExpandDimension(name='future_day_3_expanded')(future_day_3)

    future_reshaped = ConcatenateTimesteps(name='future_reshaped')(
        [future_day_1_expanded, future_day_2_expanded, future_day_3_expanded]
    )

    # **Independent output layer for each index**
    def create_output_layer(input_tensor, name):
        x = TimeDistributed(Dense(64, activation='relu'), name=f'{name}_dense1')(input_tensor)
        x = TimeDistributed(Dense(32, activation='relu'), name=f'{name}_dense2')(x)
        x = Dense(6, activation='linear', name=f'{name}_final_output')(x)
        return x

    index_inx_output_final = create_output_layer(future_reshaped, 'index_inx')
    index_dj_output_final = create_output_layer(future_reshaped, 'index_dj')
    index_ixic_output_final = create_output_layer(future_reshaped, 'index_ixic')
    index_ndx_output_final = create_output_layer(future_reshaped, 'index_ndx')
    stock_output_final = create_output_layer(future_reshaped, 'stock')

    news_sentiment_loss = Dense(1, activation='linear', name='news_sentiment_output')(text_output)

    # Build the model
    model = Model(
        inputs=[
            text_input, pos_input, entity_input, sentiment_input,
            index_inx_input, index_dj_input, index_ixic_input, index_ndx_input,
            stock_input
        ],
        outputs=[
            index_inx_output_final, index_dj_output_final, index_ixic_output_final,
            index_ndx_output_final, stock_output_final
        ]
    )

    # Optimizer and learning-rate schedule
    lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate=0.0005,  # lower initial learning rate
        decay_steps=10000,
        alpha=0.1
    )
    optimizer = tf.keras.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=0.01)

    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=[['mae', 'mse']] * 5)

    return model
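As a quick orientation to the new builder, a minimal sketch of constructing the model; the feature dimensions passed in below are assumptions for illustration, not values taken from the actual training pipeline.

from model_build import build_model_1118

# Assumed dimensions: 300-d Word2Vec document vectors, 64-d POS and NER feature
# vectors, and 30-day windows with 6 price/volume features per day.
model = build_model_1118(
    word2vec_embedding_dim=300,
    pos_tag_dim=64,
    entity_dim=64,
    time_series_input_shape=(30, 6),
)
model.summary()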
preprocess.py
CHANGED
@@ -68,18 +68,20 @@ class LazyWord2Vec:
         self.model_path = model_path
         self._model = None

-
-    def model(self):
+    def load_model(self):
         if self._model is None:
             print(f"Loading Word2Vec model from path: {self.model_path}...")
             self._model = KeyedVectors.load(self.model_path, mmap='r')
+
+    @property
+    def model(self):
+        self.load_model()
         return self._model

     @property
     def vector_size(self):
         self.load_model()
-        return self.model.vector_size
-
+        return self.model.vector_size

     def __getitem__(self, key):
         return self.model[key]
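The preprocess.py change splits lazy loading into an explicit load_model() plus a model property, so nothing is read from disk until the vectors are first needed. A small hypothetical usage sketch (the model path and lookup word are made up):

from preprocess import LazyWord2Vec

vectors = LazyWord2Vec("models/word2vec.kv")  # hypothetical path; nothing is loaded yet
print(vectors.vector_size)                    # first access calls load_model() via the property
print(vectors["market"][:5])                  # later lookups reuse the already-loaded KeyedVectors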
us_stock.py
CHANGED
@@ -166,11 +166,11 @@ def get_stock_history(symbol, news_date, retries=10):
 
 
     # Convert news_date into a datetime object
-
+    current_date = datetime.now()
 
     # Compute start_date and end_date
-    start_date = (
-    end_date =
+    start_date = (current_date - timedelta(days=60)).strftime("%Y%m%d")
+    end_date = current_date.strftime("%Y%m%d")
 
     stock_hist_df = None
     retry_index = 0  # initialize the retry index

@@ -244,6 +244,7 @@ def get_stock_history(symbol, news_date, retries=10):
 # result = get_stock_history('ATMU', '20231218')
 # print(result)
 
+
 # Return the historical data of the index the stock belongs to
 def get_stock_index_history(symbol, news_date, force_index=0):
     # Check which index the stock belongs to

@@ -264,12 +265,12 @@ def get_stock_index_history(symbol, news_date, force_index=0):
         index_code = ".IXIC"
         index_data = index_us_stock_index_IXIC
 
-    #
-
+    # Get the current date
+    current_date = datetime.now()
 
     # Compute start_date and end_date
-    start_date = (
-    end_date =
+    start_date = (current_date - timedelta(weeks=8)).strftime("%Y-%m-%d")
+    end_date = current_date.strftime("%Y-%m-%d")
 
     # Make sure index_data['date'] is of datetime type
     index_data['date'] = pd.to_datetime(index_data['date'])

@@ -333,7 +334,85 @@ def find_stock_codes_or_names(entities):
             pattern = rf'\b{re.escape(entity_lower)}\b'
             if re.search(pattern, name):
                 stock_codes.add(symbol.upper())
-                print(f"Matched name/company: '{entity_lower}' in '{name}' -> {symbol.upper()}")
+                #print(f"Matched name/company: '{entity_lower}' in '{name}' -> {symbol.upper()}")
 
     print(f"Stock codes found: {stock_codes}")
-    return list(stock_codes)
+    return list(stock_codes)
+
+
+def process_history(stock_history, target_date, history_days=30, following_days=3):
+    # Check whether the data is empty
+    if stock_history.empty:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    # Make sure the date column exists and convert it to datetime format
+    if 'date' not in stock_history.columns:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    stock_history['date'] = pd.to_datetime(stock_history['date'])
+    target_date = pd.to_datetime(target_date)
+
+    # Sort by date in ascending order
+    stock_history = stock_history.sort_values('date')
+
+    # Find the index corresponding to the target date
+    target_row = stock_history[stock_history['date'] <= target_date]
+    if target_row.empty:
+        return create_empty_data(history_days), create_empty_data(following_days)
+
+    # Take the row closest to the target date
+    target_index = target_row.index[-1]
+    target_pos = stock_history.index.get_loc(target_index)
+
+    # Historical data (including the target date)
+    start_pos = max(0, target_pos - history_days + 1)
+    previous_rows = stock_history.iloc[start_pos:target_pos + 1]
+
+    # Following data
+    following_rows = stock_history.iloc[target_pos + 1:target_pos + following_days + 1]
+
+    # Drop the date column and ensure data integrity
+    previous_rows = previous_rows.drop(columns=['date'])
+    following_rows = following_rows.drop(columns=['date'])
+
+    # Handle the case where there is not enough data
+    previous_rows = handle_insufficient_data(previous_rows, history_days)
+    following_rows = handle_insufficient_data(following_rows, following_days)
+
+    return previous_rows.iloc[:, :6], following_rows.iloc[:, :6]
+
+def create_empty_data(days):
+    return pd.DataFrame({
+        '开盘': [-1] * days,
+        '收盘': [-1] * days,
+        '最高': [-1] * days,
+        '最低': [-1] * days,
+        '成交量': [-1] * days,
+        '成交额': [-1] * days
+    })
+
+def handle_insufficient_data(data, required_days):
+    current_rows = len(data)
+    if current_rows < required_days:
+        missing_rows = required_days - current_rows
+        empty_data = create_empty_data(missing_rows)
+        return pd.concat([empty_data, data]).reset_index(drop=True)
+    return data
+
+
+if __name__ == "__main__":
+    # Test the functions
+    result = find_stock_entry('AAPL')
+    print(f"find_stock_entry: {result}")
+    result = get_stock_history('AAPL', '20240214')
+    print(f"get_stock_history: {result}")
+    result = get_stock_index_history('AAPL', '20240214')
+    print(f"get_stock_index_history: {result}")
+    result = find_stock_codes_or_names([('苹果', 'ORG'), ('苹果公司', 'ORG')])
+    print(f"find_stock_codes_or_names: {result}")
+    result = process_history(get_stock_history('AAPL', '20240214'), '20240214')
+    print(f"process_history: {result}")
+    result = process_history(get_stock_index_history('AAPL', '20240214'), '20240214')
+    print(f"process_history: {result}")
+    pass
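To illustrate the padding behaviour of the new process_history helper, a small self-contained example with a synthetic five-day history (invented numbers, not exchange data); it assumes us_stock imports cleanly in the current environment.

import pandas as pd

from us_stock import process_history

history = pd.DataFrame({
    "date": pd.date_range("2024-02-01", periods=5, freq="D"),
    "开盘": [10, 11, 12, 13, 14],
    "收盘": [11, 12, 13, 14, 15],
    "最高": [12, 13, 14, 15, 16],
    "最低": [9, 10, 11, 12, 13],
    "成交量": [100, 110, 120, 130, 140],
    "成交额": [1000, 1100, 1200, 1300, 1400],
})

previous, following = process_history(history, "2024-02-05", history_days=30, following_days=3)
print(previous.shape)   # (30, 6): 25 leading rows of -1 padding, then the 5 real rows
print(following.shape)  # (3, 6): nothing after the target date, so all -1 padding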