Spaces:

coycs
/

mlp

Sleeping

File size: 20,641 Bytes

from typing import List
from pydantic import BaseModel
from fastapi import FastAPI, Response, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware  # 跨域
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preproc
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import io
import json
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('AGG')


# FastAPI application object; every @app.post route in this file registers on it.
app = FastAPI()

# set cross-domain whitelist
# Browser origins that are allowed to call this API (local development hosts
# plus the deployed front-end domains, in both http and https variants).
origins = [
    "http://127.0.0.1:5500",
    "http://localhost:8081",
    "http://mlca.coycs.com",
    "https://mlca.coycs.com",
    "http://celadon-lebkuchen-cc4bb0.netlify.app",
    "https://celadon-lebkuchen-cc4bb0.netlify.app"
]
# Install the CORS middleware: only the whitelisted origins, only POST/GET,
# any request header, and credentials are permitted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)

# Utility functions


def json2df(json):
    """Build a DataFrame from JSON-style records, parsing string cells.

    String cells are parsed as Python literals, so numbers sent as text
    (including scientific notation such as ``"1e-5"``) become numeric values.
    Blank or whitespace-only cells become ``None``.  Non-string cells are
    left untouched.

    Args:
        json: Anything ``pd.DataFrame`` accepts; the endpoints in this file
            pass a list of row dicts.  NOTE: the parameter name shadows the
            ``json`` module inside this function; it is kept unchanged for
            backward compatibility with existing callers.

    Returns:
        A new DataFrame in which every string cell has been replaced by its
        parsed value.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python
            literal.
    """
    # Local import so this function stays self-contained.
    import ast

    def str2num(x):
        # SECURITY: this used to be ``eval(x)``.  The cells come straight from
        # HTTP request bodies, so eval() allowed arbitrary code execution.
        # ``ast.literal_eval`` only accepts literals (numbers, None, ...),
        # which covers every value the original intended to parse.
        if isinstance(x, str):
            # strip(): older Python versions reject leading whitespace here.
            return ast.literal_eval(x.strip())
        return x

    df = pd.DataFrame(json)
    # Blank cells -> the text "None" so that the literal parser yields None.
    df = df.replace(to_replace=r"^\s*$", value="None", regex=True)
    # ``DataFrame.applymap`` was renamed to ``DataFrame.map`` in pandas 2.1;
    # pick whichever this pandas version provides.
    if hasattr(pd.DataFrame, "map"):
        elementwise = pd.DataFrame.map
    else:
        elementwise = pd.DataFrame.applymap
    return elementwise(df, str2num)


# def process_abnormal(df, detect, method):  # 异常值处理
#     if detect == 1:  # IQR检测方式
#         for coloum in df.columns:
#             q1 = df[coloum].quantile(0.75)
#             q3 = df[coloum].quantile(0.25)
#             iqr = q1-q3
#             if method == 1:  # 删除异常值
#                 df.drop(
#                     df.loc[lambda x:x[coloum] > q1 + 1.5 * iqr].index, inplace=True)
#                 df.drop(
#                     df.loc[lambda x:x[coloum] < q3 - 1.5 * iqr].index, inplace=True)
#             elif method == 2:  # 均值替换
#                 df.loc[lambda x:x[coloum] > q1 + 1.5 *
#                        iqr, coloum]=df[coloum].mean()
#                 df.loc[lambda x:x[coloum] < q3 - 1.5 *
#                        iqr, coloum]=df[coloum].mean()
#             elif method == 3:  # 中位数替换
#                 df.loc[lambda x:x[coloum] > q1 + 1.5 *
#                        iqr, coloum]=df[coloum].median()
#                 df.loc[lambda x:x[coloum] < q3 - 1.5 *
#                        iqr, coloum]=df[coloum].median()
#             elif method == 4:  # 众数替换
#                 df.loc[lambda x:x[coloum] > q1 + 1.5 *
#                        iqr, coloum]=df[coloum].mode().iloc[0]
#                 df.loc[lambda x:x[coloum] < q3 - 1.5 *
#                        iqr, coloum]=df[coloum].mode().iloc[0]
#             elif method == 5:  # 边界替换
#                 df.loc[lambda x:x[coloum] > q1 +
#                        1.5 * iqr, coloum]=q1 + 1.5 * iqr
#                 df.loc[lambda x:x[coloum] < q3 -
#                        1.5 * iqr, coloum]=q3 - 1.5 * iqr
#     elif detect == 2:  # Z-score检测方式
#         for coloum in df.columns:
#             mean = df[coloum].mean()
#             std = df[coloum].std()
#             df.drop(
#                 df.loc[lambda x:x[coloum] > mean + 3 * std].index, inplace=True)
#             df.drop(
#                 df.loc[lambda x:x[coloum] < mean - 3 * std].index, inplace=True)
#             if method == 1:  # 删除异常值
#                 df.drop(
#                     df.loc[lambda x:x[coloum] > mean + 3 * std].index, inplace=True)
#                 df.drop(
#                     df.loc[lambda x:x[coloum] < mean - 3 * std].index, inplace=True)
#             elif method == 2:  # 均值替换
#                 df.loc[lambda x:x[coloum] > mean +
#                        3 * std, coloum]=df[coloum].mean()
#                 df.loc[lambda x:x[coloum] < mean -
#                        3 * std, coloum]=df[coloum].mean()
#             elif method == 3:  # 中位数替换
#                 df.loc[lambda x:x[coloum] > mean + 3 *
#                        std, coloum]=df[coloum].median()
#                 df.loc[lambda x:x[coloum] < mean - 3 *
#                        std, coloum]=df[coloum].median()
#             elif method == 4:  # 众数替换
#                 df.loc[lambda x:x[coloum] > mean + 3 *
#                        std, coloum]=df[coloum].mode().iloc[0]
#                 df.loc[lambda x:x[coloum] < mean - 3 *
#                        std, coloum]=df[coloum].mode().iloc[0]
#             elif method == 5:  # 边界替换
#                 df.loc[lambda x:x[coloum] > mean +
#                        3 * std, coloum]=mean + 3 * std
#                 df.loc[lambda x:x[coloum] < mean -
#                        3 * std, coloum]=mean - 3 * std
#     return df


def process_miss(df, method):
    """Drop fully-empty rows/columns, then fill the remaining missing values.

    Args:
        df: DataFrame to clean.  It is not modified; a new frame is returned.
        method: Fill strategy code:
            1 = column mean, 2 = column median, 3 = column mode,
            4 = linear interpolation (forward direction only),
            5 = previous value (forward fill), 6 = next value (backward fill).
            Any other value only performs the row/column dropping.

    Returns:
        The cleaned DataFrame.
    """
    # Discard rows, then columns, that contain nothing but missing values.
    df = df.dropna(how='all')
    df = df.dropna(axis=1, how='all')
    # Nothing left to fill; also avoids IndexError from ``mode().iloc[0]``.
    if df.empty:
        return df

    if method == 1:  # mean
        df = df.fillna(df.mean())
    elif method == 2:  # median
        df = df.fillna(df.median())
    elif method == 3:  # mode (first mode of every column)
        df = df.fillna(df.mode().iloc[0])
    elif method == 4:  # linear interpolation
        # interpolate() already keeps the non-missing values, so wrapping it
        # in fillna() as before was redundant.
        df = df.interpolate(
            method='linear', limit_direction='forward', axis=0)
    elif method == 5:  # previous value
        # fillna(method="ffill") is deprecated in recent pandas releases.
        df = df.ffill()
    elif method == 6:  # next value
        df = df.bfill()
    return df


def _outlier_bounds(values, detect):
    """Return ``(lower, upper)`` outlier limits for a column, or None.

    ``detect == 1`` uses the IQR rule (quartiles -/+ 1.5 * IQR),
    ``detect == 2`` uses the Z-score rule (mean -/+ 3 * std).
    Any other code yields None (no detection).
    """
    if detect == 1:
        upper_quartile = values.quantile(0.75)
        lower_quartile = values.quantile(0.25)
        iqr = upper_quartile - lower_quartile
        return lower_quartile - 1.5 * iqr, upper_quartile + 1.5 * iqr
    if detect == 2:
        mean = values.mean()
        std = values.std()
        return mean - 3 * std, mean + 3 * std
    return None


def process_abnormal(df_inside, df_user, detect, method):
    """Detect and treat outliers in the first 12 (feature) columns.

    The built-in and user frames are stacked so that the outlier limits are
    computed over both, processed column by column, and then split again.

    Fixes compared with the previous implementation:
      * Z-score detection no longer deletes outliers unconditionally before
        looking at ``method`` (replacement methods 2-5 never took effect).
      * ``method == 1`` now removes the whole row (features and labels) and
        the inside/user split stays correct after rows are removed.
      * Work happens on a copy instead of mutating an ``iloc`` slice.
      * Mean/median/mode replacement values are computed once per column,
        so high and low outliers receive the same value.

    Args:
        df_inside: Built-in data; features in the first 12 columns, any
            further columns are carried through unchanged.
        df_user: User data; assumed to use the same feature column names as
            ``df_inside`` (TODO confirm against the front end).
        detect: 1 = IQR rule, 2 = Z-score rule; other values do nothing.
        method: 1 = delete the row, 2 = replace with mean, 3 = median,
            4 = mode, 5 = clamp to the violated limit; other values do
            nothing.

    Returns:
        ``{"df_inside": ..., "df_user": ...}`` where ``df_user`` holds only
        the first 12 columns.  Row labels come from the stacked frame
        (inside rows first, user rows after).
    """
    n_features = 12
    n_inside = df_inside.shape[0]
    combined = pd.concat([df_inside, df_user], axis=0,
                         ignore_index=True)
    # Explicit copy: the frame is modified below.
    features = combined.iloc[:, :n_features].copy()

    for column in features.columns:
        values = features[column]
        bounds = _outlier_bounds(values, detect)
        if bounds is None:
            break  # unknown detection code: leave the data untouched
        lower, upper = bounds
        too_high = values > upper
        too_low = values < lower
        outliers = too_high | too_low
        if not outliers.any():
            continue

        if method == 1:  # delete rows containing an outlier
            # Later columns compute their limits on the remaining rows.
            features = features.loc[~outliers]
        elif method == 2:  # mean replacement
            features[column] = values.mask(outliers, values.mean())
        elif method == 3:  # median replacement
            features[column] = values.mask(outliers, values.median())
        elif method == 4:  # mode replacement
            features[column] = values.mask(outliers, values.mode().iloc[0])
        elif method == 5:  # clamp to the limit that was exceeded
            features[column] = values.mask(
                too_high, upper).mask(too_low, lower)

    # Re-attach the non-feature columns for the rows that survived.
    remaining = combined.iloc[:, n_features:].loc[features.index]
    result = pd.concat([features, remaining], axis=1)
    # ignore_index above numbered inside rows 0..n_inside-1.
    from_inside = result.index < n_inside
    df_inside = result.loc[from_inside]
    df_user = result.loc[~from_inside].iloc[:, :n_features]
    return {"df_inside": df_inside, "df_user": df_user}


def process_standard(df_inside, df_user, method):
    """Scale the first 12 (feature) columns of the built-in and user data.

    Both frames are stacked so that they are scaled together, then split
    again.  Columns after the first 12 are carried through unchanged.

    Args:
        df_inside: Built-in data; features in the first 12 columns.
        df_user: User data; assumed to use the same feature column names as
            ``df_inside`` (TODO confirm against the front end).
        method: 1 = ``minmax_scale``, 2 = ``StandardScaler``,
            3 = ``maxabs_scale``, 4 = ``RobustScaler``,
            5 = ``normalize`` along axis 0.  Any other value leaves the data
            unscaled (previously this raised while rebuilding the frame).

    Returns:
        ``{"df_inside": ..., "df_user": ...}`` where ``df_user`` holds only
        the first 12 columns.
    """
    n_features = 12
    n_inside = df_inside.shape[0]
    combined = pd.concat([df_inside, df_user], axis=0,
                         ignore_index=True)
    features = combined.iloc[:, :n_features]

    if method == 1:  # Min-max
        scaled = preproc.minmax_scale(features)
    elif method == 2:  # Z-Score
        scaled = preproc.StandardScaler().fit_transform(features)
    elif method == 3:  # MaxAbs
        scaled = preproc.maxabs_scale(features, axis=0)
    elif method == 4:  # RobustScaler
        scaled = preproc.RobustScaler().fit_transform(features)
    elif method == 5:  # normalisation
        scaled = preproc.normalize(features, axis=0)
    else:
        scaled = None

    if scaled is not None:
        # Restore labels on the scaler output, then rebuild the frame rather
        # than writing float results into the original columns in place.
        scaled_features = pd.DataFrame(
            scaled, columns=features.columns, index=features.index)
        combined = pd.concat(
            [scaled_features, combined.iloc[:, n_features:]], axis=1)

    df_inside = combined.iloc[:n_inside, :]
    df_user = combined.iloc[n_inside:, :n_features]
    return {"df_inside": df_inside, "df_user": df_user}


def _build_regressor(algorithm, paras):
    """Return an unfitted regressor for an algorithm code, or None if unknown.

    Codes: 1 = ordinary least squares, 2 = random forest, 3 = MLP (two hidden
    layers, lbfgs solver), 4 = XGBoost, 5 = LightGBM.  ``paras`` is the
    hyper-parameter dict; only the keys of the selected algorithm are read.
    """
    if algorithm == 1:
        return LinearRegression(fit_intercept=paras["fit_intercept"])
    if algorithm == 2:
        return RandomForestRegressor(
            n_estimators=paras["n_estimators"],
            criterion=paras["criterion"],
            max_depth=paras["max_depth"],
            random_state=0)
    if algorithm == 3:
        return MLPRegressor(
            hidden_layer_sizes=(paras["hidden_layer_sizes_1"],
                                paras["hidden_layer_sizes_2"]),
            activation=paras["activation"],
            solver='lbfgs',
            random_state=paras["random_state"])
    if algorithm == 4:
        return XGBRegressor(
            max_depth=paras["max_depth"],
            learning_rate=paras["learning_rate"],
            n_estimators=paras["n_estimators"])
    if algorithm == 5:
        return lgb.LGBMRegressor(
            objective='regression',
            max_depth=paras["max_depth"],
            learning_rate=paras["learning_rate"],
            random_state=paras["random_state"],
            n_estimators=paras["n_estimators"])
    return None


def train_model(x, y, test_size, algorithm, paras):
    """Fit a regressor on a train split and evaluate it on the test split.

    Args:
        x: Feature matrix.
        y: Target values.
        test_size: Passed to ``train_test_split`` (``random_state=0``).
        algorithm: Algorithm code, see ``_build_regressor``.
        paras: Hyper-parameter dict for the selected algorithm.

    Returns:
        A dict with ``"y_test"`` and ``"y_pred"`` lists, ``"error"``
        (``MAE``, ``RMSE``, ``R2`` rounded to 3 decimals) and ``"results"``
        (coefficients rounded to 4 decimals and intercept rounded to 3 for
        algorithm 1, otherwise empty).  For an unknown algorithm code the
        error string is returned instead, as before.
    """
    # Validate the algorithm first so no work is done for an unknown code.
    model = _build_regressor(algorithm, paras)
    if model is None:
        return "模型训练出错"

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=0)
    model.fit(x_train, y_train)

    results = {}
    if algorithm == 1:
        # Expose the fitted line for the linear model only.
        results["coef"] = [round(float(c), 4) for c in model.coef_.tolist()]
        results["intercept"] = round(float(model.intercept_), 3)

    y_pred = model.predict(x_test)
    mae = round(float(mean_absolute_error(y_test, y_pred)), 3)
    rmse = round(float(np.sqrt(mean_squared_error(y_test, y_pred))), 3)
    r2 = round(float(r2_score(y_test, y_pred)), 3)

    return {
        "y_test": np.array(y_test).tolist(),
        "y_pred": y_pred.tolist(),
        "error": {"MAE": mae, "RMSE": rmse, "R2": r2},
        "results": results,
    }


def predict_connectivity(x, x1, y, test_size, algorithm, paras):
    """Fit a regressor on the train split of ``x``/``y`` and predict ``x1``.

    The data is split exactly as in ``train_model`` (same ``test_size`` and
    ``random_state=0``) and only the training part is used for fitting.

    Args:
        x: Training feature matrix.
        x1: Feature rows to predict.
        y: Training target values.
        test_size: Passed to ``train_test_split``.
        algorithm: Algorithm code, see ``_build_regressor``.
        paras: Hyper-parameter dict for the selected algorithm.

    Returns:
        A list of predictions for ``x1``, or the error string for an unknown
        algorithm code, as before.
    """
    model = _build_regressor(algorithm, paras)
    if model is None:
        return "预测连通性出错"

    x_train, _, y_train, _ = train_test_split(
        x, y, test_size=test_size, random_state=0)
    model.fit(x_train, y_train)
    return model.predict(x1).tolist()

# Login verification


class Login(BaseModel):  # request body schema
    """Request body of the ``/login`` endpoint."""
    # Account name submitted by the client.
    username: str
    # Password submitted by the client; the endpoint compares it as plain text.
    password: str


@app.post("/login")  # endpoint
async def login(login: Login):
    """Return True when the submitted credentials match the built-in account.

    NOTE(review): the account is hard-coded in source and compared in plain
    text; consider moving it to configuration.
    """
    expected_credentials = ("admin", "123456")
    return (login.username, login.password) == expected_credentials


# Process user-supplied data

class Process_user(BaseModel):  # request body schema
    """Request body of the ``/process/user`` endpoint."""
    # Connectivity mode; selects the built-in data file "./mode_<mode>.csv".
    mode: int
    # User rows, converted to a DataFrame with json2df.
    data: List
    # Only element 0 is read: a dict with "state" and "method".
    miss: List
    # Only element 0 is read: a dict with "state", "detect" and "method".
    abnormal: List
    # Only element 0 is read: a dict with "state" and "method".
    standard: List


@app.post("/process/user")  # endpoint
async def process_user(user: Process_user):
    """Pre-process user data together with the built-in data of one mode.

    The steps run in a fixed order and each is skipped unless its "state"
    flag is truthy: outlier handling, missing-value filling (user data
    only), then standardisation.

    Returns a dict with the processed built-in rows under "inside" and the
    processed user rows under "user", each a JSON string of records.
    """
    def as_records(frame):
        # Every cell is stringified before JSON encoding.
        return frame.astype('str').to_json(orient='records')

    # Built-in data for the selected connectivity mode; incomplete rows dropped.
    inside_frame = pd.read_csv("./mode_{}.csv".format(user.mode)).dropna(axis=0)
    user_frame = json2df(user.data)
    abnormal_cfg, miss_cfg, standard_cfg = (
        user.abnormal[0], user.miss[0], user.standard[0])

    if abnormal_cfg["state"]:  # outlier handling
        cleaned = process_abnormal(
            inside_frame, user_frame,
            abnormal_cfg["detect"], abnormal_cfg["method"])
        inside_frame, user_frame = cleaned["df_inside"], cleaned["df_user"]

    if miss_cfg["state"]:  # missing-value filling
        user_frame = process_miss(user_frame, miss_cfg["method"])

    if standard_cfg["state"]:  # standardisation
        scaled = process_standard(
            inside_frame, user_frame, standard_cfg["method"])
        inside_frame, user_frame = scaled["df_inside"], scaled["df_user"]

    return {"inside": as_records(inside_frame), "user": as_records(user_frame)}

    # # 用astype将数值转科学计数法
    # return df.astype('str').to_json(orient='records')

# Process built-in data


class Process_inside(BaseModel):  # request body schema
    """Request body of the ``/process/inside`` endpoint."""
    # Built-in rows, converted to a DataFrame with json2df.
    data: List
    # Only element 0 is read: a dict with "state", "detect" and "method".
    abnormal: List
    # Only element 0 is read: a dict with "state" and "method".
    standard: List


@app.post("/process/inside")  # endpoint
async def process_inside(inside: Process_inside):
    """Pre-process built-in data: optional outlier handling, then scaling.

    ``process_abnormal`` and ``process_standard`` take a built-in frame plus
    a user frame and return a dict.  This endpoint previously called them
    with their old single-frame signatures, which raised ``TypeError``.
    It now passes a zero-row user frame and keeps only the ``"df_inside"``
    part of the result.

    Returns:
        The processed rows as a JSON string of records.
    """
    df = json2df(inside.data)
    abnormal = inside.abnormal[0]
    standard = inside.standard[0]

    # Outlier handling.
    if abnormal["state"]:
        # df.iloc[:0] is a zero-row frame with the same columns: "no user data".
        df = process_abnormal(
            df, df.iloc[:0], abnormal["detect"], abnormal["method"])["df_inside"]

    # Standardisation: process_standard scales only the first 12 (feature)
    # columns and carries the label columns through unchanged.
    if standard["state"]:
        df = process_standard(
            df, df.iloc[:0], standard["method"])["df_inside"]

    # Every cell is stringified before JSON encoding.
    return df.astype('str').to_json(orient='records')


# Train models
class Train(BaseModel):  # request body schema
    """Request body of the ``/train`` endpoint."""
    # Training rows; must provide "BSR", "SBR" and "D" columns, and the
    # first 12 columns are used as features.
    data: List
    # Forwarded to train_test_split as test_size.
    test_size: float
    # Algorithm code forwarded to train_model.
    algorithm: int
    # Only element 0 is read: the hyper-parameter dict for the algorithm.
    paras: List


@app.post("/train")  # endpoint
async def train(train: Train):
    """Train one model for BSR and one for SBR and report their evaluation.

    The response also contains "d": the held-out "D" values next to the
    element-wise sum of the BSR and SBR predictions.
    """
    frame = json2df(train.data)
    split_ratio = train.test_size
    algo = train.algorithm
    hyper_params = train.paras[0]

    features = frame.iloc[:, :12]
    target_bsr = frame.loc[:, "BSR"]
    target_sbr = frame.loc[:, "SBR"]
    target_d = frame.loc[:, "D"]

    bsr = train_model(features, target_bsr, split_ratio, algo, hyper_params)
    sbr = train_model(features, target_sbr, split_ratio, algo, hyper_params)

    # Only the held-out D values are needed.  The split uses the same
    # test_size and random_state as train_model does.
    d_holdout = train_test_split(
        features, target_d, test_size=split_ratio, random_state=0)[3]
    d = {
        "y_test": np.array(d_holdout).tolist(),
        "y_pred": np.sum([bsr["y_pred"], sbr["y_pred"]], axis=0).tolist(),
    }
    return {"bsr": bsr, "sbr": sbr,  "d": d}

# Predict connectivity


class Predict(BaseModel):  # request body schema
    """Request body of the ``/predict`` endpoint."""
    # Training rows; must provide "BSR" and "SBR" columns, and the first 12
    # columns are used as features.
    data_train: List
    # Rows to predict; passed to the fitted model and echoed in the response.
    data_predict: List
    # Forwarded to train_test_split as test_size.
    test_size: float
    # Algorithm code forwarded to predict_connectivity.
    algorithm: int
    # Only element 0 is read: the hyper-parameter dict for the algorithm.
    paras: List


@app.post("/predict")  # endpoint
async def predict(predict: Predict):
    """Predict BSR and SBR for the submitted rows and derive D as their sum.

    Returns the submitted rows with "BSR", "SBR" and "D" columns appended,
    as a JSON string of records.
    """
    training = json2df(predict.data_train)
    to_score = json2df(predict.data_predict)
    split_ratio = predict.test_size
    algo = predict.algorithm
    hyper_params = predict.paras[0]

    features = training.iloc[:, :12]
    target_bsr = training.loc[:, "BSR"]
    target_sbr = training.loc[:, "SBR"]

    bsr = predict_connectivity(
        features, to_score, target_bsr, split_ratio, algo, hyper_params)
    sbr = predict_connectivity(
        features, to_score, target_sbr, split_ratio, algo, hyper_params)
    d = np.sum([bsr, sbr], axis=0).tolist()

    # Transpose the three prediction lists into one row per sample so they
    # line up with the submitted rows.
    submitted = pd.DataFrame(predict.data_predict)
    predicted = pd.DataFrame(
        data=np.transpose([bsr, sbr, d]).tolist(),
        columns=["BSR", "SBR",  "D"])
    return pd.concat([submitted, predicted], axis=1).to_json(orient='records')
    # return pd.DataFrame(data=np.array(data).T.tolist(), columns=["BSR", "SBR",  "D"]).to_json(orient='records')


# # 图片测试

# def create_img():
#     plt.rcParams['figure.figsize'] = [7.50, 3.50]
#     plt.rcParams['figure.autolayout'] = True
#     plt.plot([1, 2])
#     img_buf = io.BytesIO()
#     plt.savefig(img_buf, format='png')
#     plt.close()
#     return img_buf


# @app.get('/png')
# async def get_img(background_tasks: BackgroundTasks):
#     img_buf = create_img()
#     # get the entire buffer content
#     # because of the async, this will await the loading of all content
#     bufContents: bytes = img_buf.getvalue()
#     background_tasks.add_task(img_buf.close)
#     headers = {'Content-Disposition': 'inline; filename="out.png"'}
#     return Response(bufContents, headers=headers, media_type='image/png')