import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.formula.api as smfProject
[Project Title] 약물 효율성과 안전성 평가를 위한 RWD 분석 및 RWE 생성
[Project Overview] - 10,000명 이상 환자 데이터를 활용한 치료 효과 및 안전성 평가를 위한 통계 분석 수행
[My role] - 연구 계획서 기반 통계 분석 계획서/보고서SAP 작성 - SDTM datasets 기반 ADaM datasets 생성 - 기술 통계 및 추론 통계 분석 수행 - 연구 결과 보고서를 위한 TFLs 생성
[Analysis Workflow Diagram] - 분석 데이터 준비 → 통계 분석 → 시각화/TFLs 생성 → 통계 분석 보고서 작성
[Tools] SAS | R
Import
Data
np.random.seed(1212)
n = 30
visit1 = np.random.normal(100, 10, n)
visit2 = visit1 + np.random.normal(-5, 5, n)
visit3 = visit1 + np.random.normal(-10, 5, n)df = pd.DataFrame({
'Subject': np.arange(1, n + 1),
'Visit1': visit1,
'Visit2': visit2,
'Visit3': visit3
})
df['Change_V2_V1'] = df['Visit2'] - df['Visit1']
df['Change_V3_V1'] = df['Visit3'] - df['Visit1']Function
Same as
Univariate of SAS
def summary_stats(data):
mean = np.mean(data)
sd = np.std(data, ddof=1)
se = stats.sem(data)
ci_low, ci_high = stats.t.interval(0.95, len(data)-1, loc=mean, scale=se)
return round(mean,2), round(sd,2), round(se,2), len(data), f"({round(ci_low,2)}, {round(ci_high,2)})"visit_summary = pd.DataFrame([summary_stats(df[col]) for col in ['Visit1', 'Visit2', 'Visit3']],
columns=['Mean', 'SD', 'SE', 'N', '95% CI'],
index=['Visit1', 'Visit2', 'Visit3'])visit_summary| Mean | SD | SE | N | 95% CI | |
|---|---|---|---|---|---|
| Visit1 | 99.67 | 7.91 | 1.44 | 30 | (96.72, 102.62) |
| Visit2 | 94.52 | 9.08 | 1.66 | 30 | (91.13, 97.91) |
| Visit3 | 88.33 | 9.74 | 1.78 | 30 | (84.69, 91.96) |
change_summary = pd.DataFrame([summary_stats(df['Change_V2_V1']),
summary_stats(df['Change_V3_V1'])],
columns=['Mean', 'SD', 'SE', 'N', '95% CI'],
index=['Visit2 - Visit1', 'Visit3 - Visit1'])change_summary| Mean | SD | SE | N | 95% CI | |
|---|---|---|---|---|---|
| Visit2 - Visit1 | -5.15 | 4.81 | 0.88 | 30 | (-6.95, -3.36) |
| Visit3 - Visit1 | -11.35 | 5.22 | 0.95 | 30 | (-13.29, -9.4) |
Comparision with SAS
PROC TTEST DATA=mydata;
PAIRED Visit2*Visit1;
RUN;
change_summary['Paired t-test p'] = [round(stats.ttest_rel(df['Visit2'], df['Visit1']).pvalue, 4),
round(stats.ttest_rel(df['Visit3'], df['Visit1']).pvalue, 4)]change_summary| Mean | SD | SE | N | 95% CI | Paired t-test p | |
|---|---|---|---|---|---|---|
| Visit2 - Visit1 | -5.15 | 4.81 | 0.88 | 30 | (-6.95, -3.36) | 0.0 |
| Visit3 - Visit1 | -11.35 | 5.22 | 0.95 | 30 | (-13.29, -9.4) | 0.0 |
PROC TTEST DATA=mydata H0=0;
VAR change_value;
RUN;
change_summary['One-sample t-test p'] = [round(stats.ttest_1samp(df['Change_V2_V1'], 0).pvalue, 4),
round(stats.ttest_1samp(df['Change_V3_V1'], 0).pvalue, 4)]change_summary| Mean | SD | SE | N | 95% CI | Paired t-test p | One-sample t-test p | |
|---|---|---|---|---|---|---|---|
| Visit2 - Visit1 | -5.15 | 4.81 | 0.88 | 30 | (-6.95, -3.36) | 0.0 | 0.0 |
| Visit3 - Visit1 | -11.35 | 5.22 | 0.95 | 30 | (-13.29, -9.4) | 0.0 | 0.0 |
df_long = pd.melt(df, id_vars='Subject', value_vars=['Visit1', 'Visit2', 'Visit3'],
var_name='Visit', value_name='Score')df_long| Subject | Visit | Score | |
|---|---|---|---|
| 0 | 1 | Visit1 | 107.373521 |
| 1 | 2 | Visit1 | 93.827029 |
| 2 | 3 | Visit1 | 104.629304 |
| 3 | 4 | Visit1 | 96.558097 |
| 4 | 5 | Visit1 | 109.795635 |
| ... | ... | ... | ... |
| 85 | 26 | Visit3 | 98.156855 |
| 86 | 27 | Visit3 | 87.828647 |
| 87 | 28 | Visit3 | 87.818170 |
| 88 | 29 | Visit3 | 95.441655 |
| 89 | 30 | Visit3 | 72.320460 |
90 rows × 3 columns
df_long['Visit'] = df_long['Visit'].map({'Visit1': 1, 'Visit2': 2, 'Visit3': 3})
df_long| Subject | Visit | Score | |
|---|---|---|---|
| 0 | 1 | 1 | 107.373521 |
| 1 | 2 | 1 | 93.827029 |
| 2 | 3 | 1 | 104.629304 |
| 3 | 4 | 1 | 96.558097 |
| 4 | 5 | 1 | 109.795635 |
| ... | ... | ... | ... |
| 85 | 26 | 3 | 98.156855 |
| 86 | 27 | 3 | 87.828647 |
| 87 | 28 | 3 | 87.818170 |
| 88 | 29 | 3 | 95.441655 |
| 89 | 30 | 3 | 72.320460 |
90 rows × 3 columns
Mixed Model
Same as Proc
model = smf.mixedlm("Score ~ Visit", df_long, groups=df_long["Subject"]).fit()
model<statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x7f9ddb066a90>
model.pvalues['Visit']9.562865213858627e-24
mixed_model_pvalue = round(model.pvalues['Visit'], 4)
mixed_model_pvalue0.0
visit_summary['Type'] = 'Visit'
change_summary['Type'] = 'Change'summary_combined = pd.concat([visit_summary, change_summary])
summary_combined['Repeated Mixed Model p'] = ''
summary_combined.loc['Visit1', 'Repeated Mixed Model p'] = mixed_model_pvaluesummary_combined.reset_index(inplace=True)
summary_combined.rename(columns={'index': 'Comparison'}, inplace=True)summary_combined| Comparison | Mean | SD | SE | N | 95% CI | Type | Paired t-test p | One-sample t-test p | Repeated Mixed Model p | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Visit1 | 99.67 | 7.91 | 1.44 | 30 | (96.72, 102.62) | Visit | NaN | NaN | 0.0 |
| 1 | Visit2 | 94.52 | 9.08 | 1.66 | 30 | (91.13, 97.91) | Visit | NaN | NaN | |
| 2 | Visit3 | 88.33 | 9.74 | 1.78 | 30 | (84.69, 91.96) | Visit | NaN | NaN | |
| 3 | Visit2 - Visit1 | -5.15 | 4.81 | 0.88 | 30 | (-6.95, -3.36) | Change | 0.0 | 0.0 | |
| 4 | Visit3 - Visit1 | -11.35 | 5.22 | 0.95 | 30 | (-13.29, -9.4) | Change | 0.0 | 0.0 |