mirror of
https://github.com/ChuXunYu/OfficeFileHandle.git
synced 2026-01-31 09:01:25 +00:00
96 lines
3.3 KiB
Python
96 lines
3.3 KiB
Python
import pandas as pd
|
||
import numpy as np
|
||
import re
|
||
|
||
def _parse_gestational_age(value):
    """Convert one raw gestational-age cell into fractional weeks.

    Accepted forms are a plain number ("12.5", taken as weeks) and a
    week/day string containing "w" ("11w+6", "11w6d", "12W"); the first run
    of digits after the "w" is read as days and contributes days / 7.

    Returns NaN for missing values, for text that is neither a plain number
    nor contains "w", for a non-numeric week part, and when both weeks and
    days are zero.
    """
    if pd.isna(value):
        return np.nan
    text = str(value).strip().lower()

    # A bare number such as "12.5" is already expressed in weeks.
    if re.fullmatch(r'\d+(\.\d+)?', text):
        return float(text)

    if 'w' not in text:
        return np.nan

    parts = text.split('w')
    try:
        weeks = float(parts[0])
    except ValueError:
        # Only the week conversion can fail; anything else should surface.
        return np.nan

    # Day part looks like "+6" or "6d"; take its first run of digits.
    day_match = re.search(r'(\d+)', parts[1])
    days = float(day_match.group(1)) if day_match else 0.0

    if weeks == 0 and days == 0:
        return np.nan
    return weeks + days / 7.0


def _read_csv_with_fallback(file_path):
    """Read the CSV as UTF-8 and retry as GB18030 if decoding fails."""
    try:
        return pd.read_csv(file_path)
    except UnicodeDecodeError:
        # Rewind file-like inputs so the retry starts from the beginning.
        if hasattr(file_path, 'seek'):
            file_path.seek(0)
        return pd.read_csv(file_path, encoding='gb18030')


def clean_male_fetus_data(file_path):
    """Load the male-fetus CSV and return the rows usable for modelling.

    Steps:
      1. Read the file (UTF-8 first, GB18030 as a fallback).
      2. Rename the known Chinese headers to short English names; columns
         not listed in the mapping keep their original names.
      3. Parse the raw gestational-age text ("11w+6") into a numeric ``GA``
         column measured in weeks.
      4. Coerce Age/Height/Weight/BMI/GA/Y_Conc to numbers (bad cells -> NaN).
      5. Keep rows where GA, BMI and Y_Conc are present and satisfy
         10 < BMI < 60, 0 < GA < 45 and Y_Conc > 0.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        An independent DataFrame holding the cleaned rows, or ``None`` if
        any step raised (the error message is printed).
    """
    try:
        # 1. Load the data.
        df = _read_csv_with_fallback(file_path)
        print(f"原始数据行数: {len(df)}")

        # 2. Map the headers found in the source file to working names.
        col_map = {
            '孕妇代码': 'ID',
            '年龄': 'Age',
            '身高': 'Height',
            '体重': 'Weight',
            '检测孕周': 'GA_Raw',  # raw "11w+6" style text
            '孕妇BMI': 'BMI',
            'Y染色体浓度': 'Y_Conc',  # key modelling column
            '检测日期': 'TestTime',
        }
        df_clean = df.rename(columns=col_map)

        # 3. Gestational age: "11w+6" -> 11.857 weeks.
        df_clean['GA'] = df_clean['GA_Raw'].apply(_parse_gestational_age)

        # 4. Numeric coercion; unparseable cells become NaN.
        numeric_cols = ['Age', 'Height', 'Weight', 'BMI', 'GA', 'Y_Conc']
        for col in numeric_cols:
            if col in df_clean.columns:
                df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

        # 5. Require the three key fields and drop implausible values.
        df_male = df_clean.dropna(subset=['GA', 'BMI', 'Y_Conc'])
        plausible = (
            (df_male['BMI'] > 10) & (df_male['BMI'] < 60)  # plausible BMI
            & (df_male['GA'] > 0) & (df_male['GA'] < 45)  # plausible weeks
            & (df_male['Y_Conc'] > 0)  # Y concentration must be positive
        )
        # Copy last so callers get a frame detached from the intermediates.
        df_male = df_male[plausible].copy()

        print(f"清洗完成,有效样本数: {len(df_male)}")
        return df_male

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None.
        print(f"数据清洗发生错误: {e}")
        return None
|
||
|
||
# ------------------------------------------
# Script section: run the cleaning step
# ------------------------------------------
file_name = '男胎数据.csv'  # name of the input CSV file
cleaned_data = clean_male_fetus_data(file_name)

if cleaned_data is not None:
    # Preview the first five cleaned rows as a sanity check.
    print(cleaned_data.head()[['ID', 'Age', 'BMI', 'GA_Raw', 'GA', 'Y_Conc']])

    # To persist the result, uncomment the line below.
    # cleaned_data.to_csv('cleaned_male_data.csv', index=False)