# Mirror of https://github.com/ChuXunYu/OfficeFileHandle.git
# (synced 2026-01-31 02:01:26 +00:00; original listing: 77 lines, 2.9 KiB, Python)
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

def parse_ga(s):
    """Convert a gestational-age label such as ``"12w+3"`` to fractional weeks.

    The label is lower-cased, every ``w`` and ``+`` is replaced by a space,
    and the remaining whitespace-separated tokens are read as
    ``<weeks> [<days>]``. The result is ``weeks + days / 7``.

    Args:
        s: Raw cell value (string, number, or missing).

    Returns:
        The gestational age in weeks as a float, or ``np.nan`` when the
        value is missing, empty, or cannot be parsed.
    """
    try:
        if pd.isna(s):
            return np.nan
        parts = str(s).lower().replace('w', ' ').replace('+', ' ').split()
        weeks = float(parts[0])
        days = float(parts[1]) if len(parts) > 1 else 0.0
    except (ValueError, IndexError, TypeError):
        # ValueError: non-numeric token (or an array-like passed to `if`);
        # IndexError: empty label; TypeError: un-convertible object.
        # Unparseable labels are deliberately mapped to NaN so the rows can
        # be dropped later, instead of aborting the whole run.
        return np.nan
    return weeks + days / 7.0


try:
    # 1. Load the data: the two CSV files are stacked row-wise.
    df0 = pd.read_csv('input_file_0.csv')
    df1 = pd.read_csv('input_file_1.csv')
    df = pd.concat([df0, df1], axis=0, ignore_index=True)

    # Clean the column names (strip surrounding whitespace).
    df.columns = [c.strip() for c in df.columns]

    # Keep male-fetus samples: rows with a present, strictly positive
    # value in the 'Y染色体浓度' column.
    male_df = df[df['Y染色体浓度'].notna() & (df['Y染色体浓度'] > 0)].copy()

    # Parse the gestational-week label into a numeric feature.
    male_df['GA_numeric'] = male_df['检测孕周'].apply(parse_ga)

    # Drop rows with a missing target or a missing feature.
    male_df = male_df.dropna(subset=['Y染色体浓度', 'GA_numeric', '孕妇BMI'])

    # 2. Prepare the design matrix and the target vector.
    X = male_df[['GA_numeric', '孕妇BMI']].values
    y = male_df['Y染色体浓度'].values

    # Train/test split (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 3. Build the GPR model.
    # Key point: feature standardisation matters a lot for GPR. The scaler
    # is fitted on the training split only and then applied to the test split.
    scaler_X = StandardScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)

    # Kernel design: RBF (trend) + WhiteKernel (noise).
    # The initial length_scale is 1.0 because the inputs are standardised.
    kernel = C(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.1)

    # Fit (n_restarts_optimizer=0 to save time; 5-10 is advisable for real
    # modelling work).
    # NOTE(review): only X is standardised; y is passed raw and normalize_y
    # is left at its default. Consider normalize_y=True — confirm against
    # the modelling intent before changing, since it alters the results.
    gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=0, random_state=42)
    gpr.fit(X_train_scaled, y_train)

    # 4. Build the baseline model (linear regression with an interaction term).
    poly = make_pipeline(
        PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
        LinearRegression()
    )
    poly.fit(X_train, y_train)

    # 5. Evaluate both models on the held-out test split.
    y_pred_gpr, y_std = gpr.predict(X_test_scaled, return_std=True)
    y_pred_poly = poly.predict(X_test)

    print("=== 模型性能大比拼 (Test Set) ===")
    print(f"GPR RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_gpr)):.6f}")
    print(f"Poly RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_poly)):.6f}")
    print(f"GPR R2: {r2_score(y_test, y_pred_gpr):.4f}")
    print(f"Poly R2: {r2_score(y_test, y_pred_poly):.4f}")
    print(f"\nGPR Learned Kernel Params: {gpr.kernel_}")

except Exception as e:
    # Top-level boundary of the script: report the failure instead of
    # surfacing a traceback.
    print(f"Error: {e}")