Files
OfficeFileHandle/xlsx2csv/csv_output/1.py
ChuXun 8d4419b1a0 1
2025-12-29 03:11:16 +08:00

96 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import pandas as pd
import numpy as np
import re
def _parse_gestational_age(raw):
    """Convert one raw gestational-age cell into a float number of weeks.

    Accepted forms (case-insensitive, surrounding whitespace ignored):
      * a plain decimal such as "12.5", returned as-is;
      * "<weeks>w" optionally followed by a day count, e.g. "11w+6" or
        "11w6d", returned as weeks + days / 7.

    Returns np.nan for missing cells, for text that matches neither form,
    and when both the parsed weeks and days are zero.
    """
    if pd.isna(raw):
        return np.nan
    text = str(raw).strip().lower()

    # A plain number is already expressed in weeks.
    if re.match(r'^\d+(\.\d+)?$', text):
        return float(text)

    weeks = 0.0
    days = 0.0
    if 'w' in text:
        parts = text.split('w')
        try:
            weeks = float(parts[0])
        except ValueError:
            # Non-numeric week part (e.g. "w6" or "abcw"). Only ValueError is
            # caught so that KeyboardInterrupt/SystemExit are never swallowed.
            return np.nan
        # The text right after the first 'w' may carry the days ("+6", "6d").
        if parts[1]:
            d_match = re.search(r'(\d+)', parts[1])
            if d_match:
                days = float(d_match.group(1))

    # Nothing usable was parsed (or the value is literally zero).
    if weeks == 0 and days == 0:
        return np.nan
    return weeks + days / 7.0


def clean_male_fetus_data(file_path):
    """Read the male-fetus CSV and return the rows usable for modelling.

    Steps:
      1. Load the CSV with pandas and print the raw row count.
      2. Rename the known Chinese headers to short English names
         (ID, Age, Height, Weight, GA_Raw, BMI, Y_Conc, TestTime); columns
         not in the mapping keep their original names.
      3. Parse GA_Raw (e.g. "11w+6") into a float column GA, in weeks.
      4. Coerce the numeric columns that are present to numbers; values that
         cannot be converted become NaN.
      5. Drop rows lacking GA, BMI or Y_Conc, then keep only rows with
         10 < BMI < 60, 0 < GA < 45 and Y_Conc > 0.

    Args:
        file_path: Path of the CSV file, or anything else that
            pandas.read_csv accepts as its first argument.

    Returns:
        The filtered DataFrame, or None when any step fails (for example the
        file cannot be read or a required column is absent). The error is
        printed rather than raised.
    """
    try:
        # 1. Load the data.
        df = pd.read_csv(file_path)
        print(f"原始数据行数: {len(df)}")

        # 2. Header mapping (keys are the headers found in the input file).
        col_map = {
            '孕妇代码': 'ID',
            '年龄': 'Age',
            '身高': 'Height',
            '体重': 'Weight',
            '检测孕周': 'GA_Raw',  # raw "11w+6" style text
            '孕妇BMI': 'BMI',
            'Y染色体浓度': 'Y_Conc',  # key column
            '检测日期': 'TestTime',
        }
        df_clean = df.rename(columns=col_map)

        # 3. Gestational age: "11w+6" -> 11.857 weeks.
        df_clean['GA'] = df_clean['GA_Raw'].apply(_parse_gestational_age)

        # 4. Numeric conversion; optional columns may be missing from the file.
        numeric_cols = ['Age', 'Height', 'Weight', 'BMI', 'GA', 'Y_Conc']
        for col in numeric_cols:
            if col in df_clean.columns:
                df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

        # 5. Keep complete rows, then discard implausible values.
        df_male = df_clean.dropna(subset=['GA', 'BMI', 'Y_Conc']).copy()
        df_male = df_male[
            (df_male['BMI'] > 10) & (df_male['BMI'] < 60)  # plausible BMI
            & (df_male['GA'] > 0) & (df_male['GA'] < 45)  # plausible weeks
            & (df_male['Y_Conc'] > 0)  # Y concentration must be positive
        ]
        print(f"清洗完成,有效样本数: {len(df_male)}")
        return df_male
    except Exception as e:
        # Deliberate best-effort boundary: callers test the result for None.
        print(f"数据清洗发生错误: {e}")
        return None
# ==========================================
# Run the cleaning step
# ==========================================
file_name = '男胎数据.csv'  # name of the input CSV file
cleaned_data = clean_male_fetus_data(file_name)
if cleaned_data is not None:
    # Preview the first 5 rows. Only columns that are actually present are
    # selected: the cleaner accepts input files without optional columns
    # such as ID or Age, and a hard-coded selection would raise KeyError.
    preview_cols = [
        col for col in ['ID', 'Age', 'BMI', 'GA_Raw', 'GA', 'Y_Conc']
        if col in cleaned_data.columns
    ]
    print(cleaned_data[preview_cols].head())
    # To save the result, uncomment the following line:
    # cleaned_data.to_csv('cleaned_male_data.csv', index=False)