import pandas as pd
import plotly.express as px
from scipy.stats import spearmanr, pearsonr
Reference
Import
Data
df = pd.read_csv('../../../delete/ScreenTime vs MentalWellness.csv').iloc[:, :-1]
df
| | user_id | age | gender | occupation | work_mode | screen_time_hours | work_screen_hours | leisure_screen_hours | sleep_hours | sleep_quality_1_5 | stress_level_0_10 | productivity_0_100 | exercise_minutes_per_week | social_hours_per_week | mental_wellness_index_0_100 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | U0001 | 33 | Female | Employed | Remote | 10.79 | 5.44 | 5.35 | 6.63 | 1 | 9.3 | 44.7 | 127 | 0.7 | 9.3 |
| 1 | U0002 | 28 | Female | Employed | In-person | 7.40 | 0.37 | 7.03 | 8.05 | 3 | 5.7 | 78.0 | 74 | 2.1 | 56.2 |
| 2 | U0003 | 35 | Female | Employed | Hybrid | 9.78 | 1.09 | 8.69 | 6.48 | 1 | 9.1 | 51.8 | 67 | 8.0 | 3.6 |
| 3 | U0004 | 42 | Male | Employed | Hybrid | 11.13 | 0.56 | 10.57 | 6.89 | 1 | 10.0 | 37.0 | 0 | 5.7 | 0.0 |
| 4 | U0005 | 28 | Male | Student | Remote | 13.22 | 4.09 | 9.13 | 5.79 | 1 | 10.0 | 38.5 | 143 | 10.1 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 395 | U0396 | 26 | Female | Student | Remote | 6.43 | 2.99 | 3.44 | 7.75 | 1 | 5.9 | 64.6 | 252 | 7.6 | 39.3 |
| 396 | U0397 | 16 | Male | Self-employed | Remote | 9.59 | 5.44 | 4.15 | 5.57 | 1 | 10.0 | 47.4 | 99 | 7.0 | 3.5 |
| 397 | U0398 | 40 | Male | Student | Remote | 8.72 | 2.36 | 6.36 | 7.56 | 1 | 9.4 | 57.3 | 193 | 10.1 | 6.6 |
| 398 | U0399 | 29 | Female | Retired | Hybrid | 5.04 | 0.94 | 4.10 | 7.32 | 1 | 7.1 | 63.6 | 97 | 12.1 | 21.0 |
| 399 | U0400 | 39 | Female | Employed | Remote | 8.86 | 3.82 | 5.04 | 6.55 | 1 | 10.0 | 41.4 | 88 | 9.8 | 8.4 |
400 rows × 15 columns
screen_time_hours은 work_screen_hours와 leisure_screen_hours의 합임을 확인
# Check whether screen_time_hours equals work_screen_hours + leisure_screen_hours.

# Exact float equality over all rows.
# Recorded output: False
(df["screen_time_hours"] == df["work_screen_hours"] + df["leisure_screen_hours"]).all()

# Difference between the total and the sum of its parts, rounded to 2 decimals.
# `diff` has to be defined before it is inspected; the exported cell order used
# it first, which raises NameError when the script is run top to bottom.
diff = (df["screen_time_hours"] - (df["work_screen_hours"] + df["leisure_screen_hours"])).round(2)

# Rows where the rounded total exceeds the sum of its parts.
# Recorded output: Series([], dtype: float64)
diff[diff > 0]

# Rows whose rounded difference is non-zero in either direction.
df_diff = df[diff != 0]
df_diff| user_id | age | gender | occupation | work_mode | screen_time_hours | work_screen_hours | leisure_screen_hours | sleep_hours | sleep_quality_1_5 | stress_level_0_10 | productivity_0_100 | exercise_minutes_per_week | social_hours_per_week | mental_wellness_index_0_100 |
|---|
df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 400 non-null object
1 age 400 non-null int64
2 gender 400 non-null object
3 occupation 400 non-null object
4 work_mode 400 non-null object
5 screen_time_hours 400 non-null float64
6 work_screen_hours 400 non-null float64
7 leisure_screen_hours 400 non-null float64
8 sleep_hours 400 non-null float64
9 sleep_quality_1_5 400 non-null int64
10 stress_level_0_10 400 non-null float64
11 productivity_0_100 400 non-null float64
12 exercise_minutes_per_week 400 non-null int64
13 social_hours_per_week 400 non-null float64
14 mental_wellness_index_0_100 400 non-null float64
dtypes: float64(8), int64(3), object(4)
memory usage: 47.0+ KB
- 문자형 컬럼
# Print the frequency table of every object/string-typed column.
char_cols = df.select_dtypes(include=['object', 'string']).columns
for col in char_cols:
    # The loop body must be indented; the flattened export had lost it.
    print(f"\n--- {col} (categorical) ---")
    print(df[col].value_counts())
--- user_id (categorical) ---
U0001 1
U0264 1
U0274 1
U0273 1
U0272 1
..
U0131 1
U0130 1
U0129 1
U0128 1
U0400 1
Name: user_id, Length: 400, dtype: int64
--- gender (categorical) ---
Female 222
Male 170
Non-binary/Other 8
Name: gender, dtype: int64
--- occupation (categorical) ---
Employed 207
Student 107
Self-employed 45
Unemployed 27
Retired 14
Name: occupation, dtype: int64
--- work_mode (categorical) ---
Remote 150
Hybrid 146
In-person 104
Name: work_mode, dtype: int64
- 숫자형 컬럼
# Descriptive statistics for the numeric columns, transposed so that each
# column of `df` becomes one row of the printed summary.
num_cols = df.select_dtypes(include="number").columns
print("\n--- Numeric summary ---")
print(df.loc[:, num_cols].describe().transpose())
--- Numeric summary ---
count mean std min 25% \
age 400.0 29.777500 7.466080 16.00 24.0000
screen_time_hours 400.0 9.024900 2.491058 1.00 7.3725
work_screen_hours 400.0 2.183075 1.931321 0.11 0.6950
leisure_screen_hours 400.0 6.841825 2.220896 0.89 5.4600
sleep_hours 400.0 7.013175 0.852421 4.64 6.3975
sleep_quality_1_5 400.0 1.397500 0.652348 1.00 1.0000
stress_level_0_10 400.0 8.150500 2.094844 0.00 6.9000
productivity_0_100 400.0 54.306500 15.020054 20.60 43.6000
exercise_minutes_per_week 400.0 109.810000 70.007045 0.00 58.0000
social_hours_per_week 400.0 7.905000 4.909632 0.00 4.5750
mental_wellness_index_0_100 400.0 20.326750 20.376793 0.00 3.6750
50% 75% max
age 30.000 35.0000 60.00
screen_time_hours 9.090 10.4950 19.17
work_screen_hours 1.455 3.0125 12.04
leisure_screen_hours 6.700 8.4175 13.35
sleep_hours 7.030 7.6400 9.74
sleep_quality_1_5 1.000 2.0000 4.00
stress_level_0_10 8.800 10.0000 10.00
productivity_0_100 51.750 63.0000 100.00
exercise_minutes_per_week 103.000 157.0000 372.00
social_hours_per_week 7.750 11.0250 23.90
mental_wellness_index_0_100 14.800 30.6500 97.00
Visualization
Character col check
# Bar chart of the number of rows per gender.
# The index and the value column are named explicitly so the chart does not
# depend on the column names produced by value_counts().reset_index(), which
# differ between pandas 1.x ('index' / 'gender') and pandas 2.x ('gender' / 'count').
gender_counts = (
    df['gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='Count')
)
fig = px.bar(
    gender_counts,
    x='Gender', y='Count',
    title='Gender Distribution',
    height=500, width=700)
# Display the figure built just above (this call was fused onto the next
# statement's line in the export, which is a syntax error).
fig.show()

# Bar chart of the number of rows per occupation.
# The index and the value column are named explicitly so the chart does not
# depend on the column names produced by value_counts().reset_index(), which
# differ between pandas 1.x ('index' / 'occupation') and pandas 2.x
# ('occupation' / 'count').
occupation_counts = (
    df['occupation']
    .value_counts()
    .rename_axis('occupation')
    .reset_index(name='Count')
)
fig = px.bar(
    occupation_counts,
    x='occupation', y='Count',
    title='Occupation Distribution',
    height=500, width=700)
# Display the figure built just above (this call was fused onto the next
# statement's line in the export, which is a syntax error).
fig.show()

# Bar chart of the number of rows per work mode.
# The index and the value column are named explicitly so the chart does not
# depend on the column names produced by value_counts().reset_index(), which
# differ between pandas 1.x ('index' / 'work_mode') and pandas 2.x
# ('work_mode' / 'count').
work_mode_counts = (
    df['work_mode']
    .value_counts()
    .rename_axis('work_mode')
    .reset_index(name='Count')
)
fig = px.bar(
    work_mode_counts,
    x='work_mode', y='Count',
    title='work_mode Distribution',
    height=500, width=700)
fig.show()Gender and Work mode
- Average Screen Time, work screen hours and leisure screen hours by Gender
성별 차이는 없어 보임
# Mean total, work and leisure screen hours per gender, drawn as grouped bars.
df_grouped = df.groupby("gender", as_index=False)[
    ["screen_time_hours", "work_screen_hours", "leisure_screen_hours"]
].mean()
fig = px.bar(
    data_frame=df_grouped,
    x="gender",
    y=["screen_time_hours", "work_screen_hours", "leisure_screen_hours"],
    title="Average Screen Time by Gender",
    labels={"value": "Hours", "gender": "Gender", "variable": "Type"},
    barmode="group",
    height=500,
    width=700,
)
fig.show()- Average Screen Time, work screen hours and leisure screen hours by work mode
remote로 하는 사람들이 당연히 work_screen_hours가 가장 높았고, 이에 따라 screen time hours도 높았음
# Mean total, work and leisure screen hours per work mode, drawn as grouped bars.
df_grouped = df.groupby("work_mode", as_index=False)[
    ["screen_time_hours", "work_screen_hours", "leisure_screen_hours"]
].mean()
fig = px.bar(
    data_frame=df_grouped,
    x="work_mode",
    y=["screen_time_hours", "work_screen_hours", "leisure_screen_hours"],
    title="Average Screen Time by work_mode",
    labels={"value": "Hours", "work_mode": "work_mode", "variable": "Type"},
    barmode="group",
    height=500,
    width=700,
)
fig.show()- sleep quality
다 왜 이렇게 낮아..ㅎㅎㅎ
# Box plot of sleep_quality_1_5 per work mode, split by gender.
# The title previously read "Stress Level by Work Mode and Gender" (copied from
# the stress-level plot) although the y-axis is sleep quality.
fig = px.box(
    df,
    x="work_mode",
    y="sleep_quality_1_5",
    color="gender",
    title="Sleep Quality by Work Mode and Gender",
    height=500,
    width=700
)
# Display the figure built just above (this call was fused onto the next
# statement's line in the export, which is a syntax error).
fig.show()

# Histogram of sleep_quality_1_5 with one panel per work mode and side-by-side
# bars per gender.
fig = px.histogram(
    df,
    x="sleep_quality_1_5",
    color="gender",
    barmode="group",
    facet_col="work_mode",
    title="Distribution of Sleep Quality by Gender and Work Mode",
    height=600,
    width=900
)
fig.show()- stress level
remote에 비해 in-person,hybrid 의 stress level이 낮네?
# Box plot of stress_level_0_10 per work mode, split by gender.
fig = px.box(
    data_frame=df,
    y="stress_level_0_10",
    x="work_mode",
    color="gender",
    width=700,
    height=500,
    title="Stress Level by Work Mode and Gender",
)
fig.show()- productivity
in-person, hybrid가 remote에 비해 높은 경향
# Box plot of productivity_0_100 per work mode, split by gender.
fig = px.box(
    data_frame=df,
    y="productivity_0_100",
    x="work_mode",
    color="gender",
    width=700,
    height=500,
    title="Productivity by Work Mode and Gender",
)
fig.show()
- exercise minutes per week
remote라고 많거나 하지 않네
# Box plot of exercise_minutes_per_week per work mode, split by gender.
# The displayed title previously misspelled "Exercise" as "Excercise".
fig = px.box(
    df,
    x="work_mode",
    y="exercise_minutes_per_week",
    color="gender",
    title="Exercise minutes per week by Work Mode and Gender",
    height=500,
    width=700
)
fig.show()- Social hours per week
remote가 오히려 낮은 경향, hybrid가 높은 경향
# Box plot of social_hours_per_week per work mode, split by gender.
fig = px.box(
    data_frame=df,
    y="social_hours_per_week",
    x="work_mode",
    color="gender",
    width=700,
    height=500,
    title="Social hours per week by Work Mode and Gender",
)
fig.show()- Mental Wellness index
remote가 낮은경향, 그렇다고 나머지 높진 않음.
# Box plot of mental_wellness_index_0_100 per work mode, split by gender.
fig = px.box(
    data_frame=df,
    y="mental_wellness_index_0_100",
    x="work_mode",
    color="gender",
    width=700,
    height=500,
    title="Mental wellness index by Work Mode and Gender",
)
fig.show()Occupation
student들이 screen 시간이 길지만 leisure hours는 직장인들과 비슷하다
# Reshape the three screen-hour columns to long form: one row per
# (occupation, screen_type) pair with the value stored in `hours`.
df_long = pd.melt(
    df,
    id_vars="occupation",
    value_vars=["screen_time_hours", "work_screen_hours", "leisure_screen_hours"],
    var_name="screen_type",
    value_name="hours",
)

# Box plot of hours per occupation; color separates total, work and leisure.
fig = px.box(
    data_frame=df_long,
    y="hours",
    x="occupation",
    color="screen_type",
    width=1000,
    height=500,
    title="Screen Hours by Occupation",
)
fig.show()- Sleep Hours
sleep hours는 비슷
# Box plot of sleep_hours for each occupation.
fig = px.box(
    data_frame=df,
    y="sleep_hours",
    x="occupation",
    width=700,
    height=500,
    title="Sleep Hours by occupation",
)
fig.show()- sleep quality
다 낮다..
# Histogram of sleep_quality_1_5 with side-by-side bars per occupation.
fig = px.histogram(
    data_frame=df,
    x="sleep_quality_1_5",
    color="occupation",
    barmode="group",
    width=800,
    height=500,
    title="Distribution of Sleep Quality",
)
fig.show()- Stress Level
Stress Level은 student가 평균적으로 높다
# Box plot of stress_level_0_10 for each occupation.
fig = px.box(
    data_frame=df,
    y="stress_level_0_10",
    x="occupation",
    width=700,
    height=500,
    title="Stress Level by occupation",
)
fig.show()- Productivity
Productivity는 학생이 제일 낮고 retired가 가장 높다
# Box plot of productivity_0_100 for each occupation.
fig = px.box(
    data_frame=df,
    y="productivity_0_100",
    x="occupation",
    width=700,
    height=500,
    title="Productivity by occupation",
)
fig.show()
- Exercise Minutes
Exercise Minutes는 unemployed가 평균적으로 가장 높고 그다음 retired, 다음은 비슷
# Box plot of exercise_minutes_per_week for each occupation.
# The displayed title previously misspelled "Exercise" as "Excercise".
fig = px.box(
    df,
    x="occupation",
    y="exercise_minutes_per_week",
    title="Exercise Minutes by occupation",
    height=500, width=700)
fig.show()- Social Hours per week
다 비슷하고 student가 낮은 경향
# Box plot of social_hours_per_week for each occupation.
fig = px.box(
    data_frame=df,
    y="social_hours_per_week",
    x="occupation",
    width=700,
    height=500,
    title="Social Hours per week by occupation",
)
fig.show()- Mental Wellness
Mental Wellness는 student가 평균적으로 가장 낮고, employed, self-employed가 그다음으로 낮으며, retired, unemployed가 높음. 그나저나 student 분산이 꽤나 높을 듯
# Box plot of mental_wellness_index_0_100 for each occupation.
fig = px.box(
    data_frame=df,
    y="mental_wellness_index_0_100",
    x="occupation",
    width=700,
    height=500,
    title="Mental Wellness by occupation",
)
fig.show()Trend Check
전반적 경향: screen time이 늘어날수록 sleep 이 줄어든다.
# Scatter of sleep_hours against screen_time_hours with an OLS trendline.
# Coloring by gender (color='gender') is left disabled.
fig = px.scatter(
    data_frame=df,
    y="sleep_hours",
    x="screen_time_hours",
    trendline="ols",
    width=700,
    height=500,
    title="Screen Time vs Sleep Hours",
)
fig.show()stress level 이 높을수록 screen time시간이 길고, sleep hours는 줄어드는 trend
# Scatter of sleep_hours against screen_time_hours with an OLS trendline;
# points are colored by stress_level_0_10.
fig = px.scatter(
    data_frame=df,
    y="sleep_hours",
    x="screen_time_hours",
    color='stress_level_0_10',
    labels={"stress_level_0_10": "Stress Level (0-10)"},
    trendline="ols",
    width=700,
    height=500,
    title="Screen Time vs Sleep Hours",
)
fig.show()
productivity가 낮을수록 screen time hour도 길고 sleep hour는 낮아지는 경향
# Scatter of sleep_hours against screen_time_hours with an OLS trendline;
# points are colored by productivity_0_100.
fig = px.scatter(
    data_frame=df,
    y="sleep_hours",
    x="screen_time_hours",
    color='productivity_0_100',
    labels={"productivity_0_100": "Productivity (0-100)"},
    trendline="ols",
    width=700,
    height=500,
    title="Screen Time vs Sleep Hours",
)
fig.show()- Mental Wellness
Mental Wellness가 낮을수록 screen time도 길고 sleep hour도 낮아지는 경향
# Scatter of sleep_hours against screen_time_hours with an OLS trendline;
# points are colored by mental_wellness_index_0_100.
fig = px.scatter(
    data_frame=df,
    y="sleep_hours",
    x="screen_time_hours",
    color='mental_wellness_index_0_100',
    labels={"mental_wellness_index_0_100": "Mental Wellness (0-100)"},
    trendline="ols",
    width=900,
    height=500,
    title="Screen Time vs Sleep Hours",
)
fig.show()- screen_time_hours, sleep_hours에 따라 graph 확인해본 결과 stress_level_0_10, productivity_0_100, mental_wellness_index_0_100가 영향이 있어보였다.
Correlation
# Pairwise correlation matrix of the screen, sleep, stress, productivity,
# exercise, social and wellness columns, using DataFrame.corr() defaults.
corr = df.loc[
    :,
    [
        'screen_time_hours', 'work_screen_hours', 'leisure_screen_hours',
        'sleep_hours', 'sleep_quality_1_5', 'stress_level_0_10',
        'productivity_0_100', 'exercise_minutes_per_week',
        'social_hours_per_week', 'mental_wellness_index_0_100',
    ],
].corr()

# Heatmap of the matrix with each coefficient printed in its cell.
px.imshow(
    corr,
    title="Correlation Heatmap",
    text_auto=True,
    width=1000,
    height=1000,
)
# Note from the notebook: because sleep_quality_1_5 is measured on a 1-to-5
# scale, Spearman correlation is used.
# Spearman rank correlation between sleep_hours and sleep_quality_1_5.
spearman_corr, spearman_p = spearmanr(df['sleep_hours'], df['sleep_quality_1_5'])
# The p-value uses general ('.3g') formatting: with '.3f' a very small p-value
# is printed as 0.000, which reads as exactly zero.
print(f"Spearman correlation: {spearman_corr:.3f}, p-value: {spearman_p:.3g}")
# Recorded output (old '.3f' format): Spearman correlation: 0.626, p-value: 0.000
- 잠자는 시간과 수면의 질의 상관계수는 0.626, p-value < 0.001로 연관 있다는 결과,
- 아래 그림으로 봐도 양의 상관관계가 있는 것으로 보인다.
# Box plot of sleep_hours for each sleep_quality_1_5 rating, with every
# individual observation drawn alongside its box.
fig = px.box(
    data_frame=df,
    y='sleep_hours',
    x='sleep_quality_1_5',
    points="all",
    width=900,
    height=500,
    title="Sleep Hours by Sleep Quality (1-5)",
)
fig.show()- stress level vs mental wellness
# Spearman rank correlation between mental_wellness_index_0_100 and stress_level_0_10.
spearman_corr, spearman_p = spearmanr(df['mental_wellness_index_0_100'], df['stress_level_0_10'])
# The p-value uses general ('.3g') formatting: with '.3f' a very small p-value
# is printed as 0.000, which reads as exactly zero.
print(f"Spearman correlation: {spearman_corr:.3f}, p-value: {spearman_p:.3g}")
# Recorded output (old '.3f' format): Spearman correlation: -0.871, p-value: 0.000
- 강한 음의 상관관계, 유의미함
px.scatter(df, x="mental_wellness_index_0_100", y="stress_level_0_10", trendline="ols", height=500, width= 900)- mental wellness vs productivity
# Spearman rank correlation between mental_wellness_index_0_100 and productivity_0_100.
spearman_corr, spearman_p = spearmanr(df['mental_wellness_index_0_100'], df['productivity_0_100'])
# The p-value uses general ('.3g') formatting: with '.3f' a very small p-value
# is printed as 0.000, which reads as exactly zero.
print(f"Spearman correlation: {spearman_corr:.3f}, p-value: {spearman_p:.3g}")
# Recorded output (old '.3f' format): Spearman correlation: 0.869, p-value: 0.000
- 강한 양의 상관, 유의미함
px.scatter(df, x="mental_wellness_index_0_100", y="productivity_0_100", trendline="ols", height=500, width= 900)- productivity vs stress level
# Spearman rank correlation between productivity_0_100 and stress_level_0_10.
spearman_corr, spearman_p = spearmanr(df['productivity_0_100'], df['stress_level_0_10'])
# The p-value uses general ('.3g') formatting: with '.3f' a very small p-value
# is printed as 0.000, which reads as exactly zero.
print(f"Spearman correlation: {spearman_corr:.3f}, p-value: {spearman_p:.3g}")
# Recorded output (old '.3f' format): Spearman correlation: -0.856, p-value: 0.000
- 강한 음의 상관, 생산성이 커질수록 스트레스 수준이 감소
px.scatter(df, x="productivity_0_100", y="stress_level_0_10", height=500, width= 900)- screen time vs stress level
# Spearman rank correlation between screen_time_hours and stress_level_0_10.
spearman_corr, spearman_p = spearmanr(df['screen_time_hours'], df['stress_level_0_10'])
# The p-value uses general ('.3g') formatting: with '.3f' a very small p-value
# is printed as 0.000, which reads as exactly zero.
print(f"Spearman correlation: {spearman_corr:.3f}, p-value: {spearman_p:.3g}")
# Recorded output (old '.3f' format): Spearman correlation: 0.719, p-value: 0.000
- 강한 양의 상관, 유의미함, 스크린 타임이 길수록 스트레스가 많아짐
px.scatter(df, x="screen_time_hours", y="stress_level_0_10", height=500, width= 900)