[Kaggle] Holistic Health and Lifestyle Score

Author

SEOYEON CHOI

Published

November 17, 2025

data

Data

  • This synthetic yet realistic dataset evaluates individuals’ overall well-being based on multiple physical and lifestyle factors. Each row represents a person’s health and lifestyle attributes, scored from real-world patterns, to predict an Overall Health Score (0–100).
  • Columns
| Column Name | Description | Type | Example |
|---|---|---|---|
| Physical_Activity | Minutes of moderate to vigorous exercise per day (0–120) | Numeric | 45 |
| Nutrition_Score | Diet quality (0–10) based on nutrient density, balance, and variety | Numeric | 8 |
| Stress_Level | Self-reported stress (1–10), where 10 = very high stress | Numeric | 6 |
| Mindfulness | Minutes per day spent in meditation, journaling, or mindful activity (0–60) | Numeric | 10 |
| Sleep_Hours | Average hours of sleep per night (3–10) | Numeric | 7.5 |
| Hydration | Liters of water consumed per day (0.5–5.0) | Numeric | 2.8 |
| BMI | Body Mass Index (18–40) | Numeric | 24.3 |
| Alcohol | Units of alcohol per week (0–20) | Numeric | 3 |
| Smoking | Cigarettes per day (0–30) | Numeric | 5 |
  • Regression → Predict Overall_Health_Score
  • Classification → Create Health_Status label (e.g., “Poor”, “Average”, “Good”)
  • Feature Importance → Which lifestyle factor affects health most
  • Dashboard → Build interactive visualizations in Streamlit or Plotly Dash

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
# Load the Kaggle CSV. The relative path climbs three directories above this
# post, so adjust it to wherever the file was downloaded.
df = pd.read_csv('../../../delete/holistic_health_lifestyle_dataset.csv')
# Preview the first rows.
df.head()
Physical_Activity Nutrition_Score Stress_Level Mindfulness Sleep_Hours Hydration BMI Alcohol Smoking Overall_Health_Score Health_Status
0 54.934283 5.643011 5.696572 0.000000 6.292214 2.578565 24.275932 4.280610 8.984006 36.950187 Poor
1 42.234714 6.389001 5.566647 4.450144 8.519054 2.448713 25.970141 7.461846 3.223304 55.167774 Average
2 57.953771 5.805238 3.126960 9.129716 6.702720 3.261433 25.193857 0.000000 4.600482 78.304426 Good
3 75.460597 7.220836 6.159168 16.496689 7.135854 3.726265 19.527300 9.958423 3.947706 94.018274 Good
4 40.316933 9.394357 2.019835 25.241623 8.076086 3.049478 23.348229 4.320347 8.084322 100.000000 Good
# Column dtypes and non-null counts (used to check for missing values).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Physical_Activity     10000 non-null  float64
 1   Nutrition_Score       10000 non-null  float64
 2   Stress_Level          10000 non-null  float64
 3   Mindfulness           10000 non-null  float64
 4   Sleep_Hours           10000 non-null  float64
 5   Hydration             10000 non-null  float64
 6   BMI                   10000 non-null  float64
 7   Alcohol               10000 non-null  float64
 8   Smoking               10000 non-null  float64
 9   Overall_Health_Score  10000 non-null  float64
 10  Health_Status         10000 non-null  object 
dtypes: float64(10), object(1)
memory usage: 859.5+ KB

Regression → Predict Overall_Health_Score

# Regression target is the continuous score. Both label columns are removed
# from the predictors so the model never sees the answer.
y = df['Overall_Health_Score']
X = df.drop(columns=['Overall_Health_Score', 'Health_Status'])

# Hold out 40% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1212
)

model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Predict on the held-out rows. A plain linear model is unbounded, so some
# predictions fall outside the dataset's 0–100 score range (e.g. 106.88 below).
y_pred = model.predict(X_test)
y_pred
array([ 54.80081517,  87.5517459 , 106.87897538, ...,  85.6034847 ,
        76.16164305,  43.69768659])
# Coefficient of determination on the held-out split.
r2 = r2_score(y_test,y_pred)
round(r2,4)
0.9053
# Root mean squared error, in the same units as Overall_Health_Score.
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
round(rmse,4)
6.1302
# Held-out actual scores against the model's predictions.
plt.figure(figsize=(12, 6))
plt.plot(y_test, y_pred, '.')

# Dashed identity line: points on it are predicted exactly.
score_range = [y.min(), y.max()]
plt.plot(score_range, score_range, 'k--')

plt.xlabel('Actual Overall Health Score')
plt.ylabel('Predicted Overall Health Score')
plt.title('Actual vs Predicted (Linear Regression)')
Text(0.5, 1.0, 'Actual vs Predicted (Linear Regression)')

# Interactive actual-vs-predicted scatter for the linear regression.
fig = px.scatter(
    x=y_test,
    y=y_pred,
    labels={'x':'Actual Overall Health Score', 'y':'Predicted Overall Health Score'},
    title='Actual vs Predicted (Interaction)',
    width=1000,
    height=600,
    template='seaborn',
)

# Dashed identity line spanning the observed score range.
score_lo, score_hi = y.min(), y.max()
fig.add_shape(
    type='line',
    x0=score_lo, y0=score_lo,
    x1=score_hi, y1=score_hi,
    line={'dash': 'dash'},
)

fig
# One fitted coefficient per predictor. The features were not rescaled before
# fitting, so each value is "score change per raw unit" of that feature and the
# magnitudes are not directly comparable across features.
pd.DataFrame({'feature':X.columns,
             'Coefficient':model.coef_})
feature Coefficient
0 Physical_Activity 0.157633
1 Nutrition_Score 3.954692
2 Stress_Level -1.509945
3 Mindfulness 1.508616
4 Sleep_Hours 2.342065
5 Hydration 3.877228
6 BMI -0.961926
7 Alcohol -1.198366
8 Smoking -1.216072
# Bar chart of the linear-regression coefficients, largest first.
coef_df = pd.DataFrame({'feature': X.columns, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

fig = px.bar(
    coef_df,
    x='feature',
    y='Coefficient',
    title='Feature Importance (Linear Regression Coefficients)',
    width=1200,
    height=500,
    template='seaborn',
)
fig.update_traces(textfont_size=10)
fig

Classification

# List the available columns before choosing the classification features.
df.columns
Index(['Physical_Activity', 'Nutrition_Score', 'Stress_Level', 'Mindfulness',
       'Sleep_Hours', 'Hydration', 'BMI', 'Alcohol', 'Smoking',
       'Overall_Health_Score', 'Health_Status'],
      dtype='object')
# Predictors are the nine lifestyle measurements; the label is the categorical
# Health_Status column.
target_col = 'Health_Status'
feature_cols = [
    'Physical_Activity', 'Nutrition_Score', 'Stress_Level', 'Mindfulness',
    'Sleep_Hours', 'Hydration', 'BMI', 'Alcohol', 'Smoking',
]
X, y = df[feature_cols], df[target_col]

# 70/30 split, stratified so each class keeps its share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1212, stratify=y
)

rf_clf = RandomForestClassifier(n_estimators=200, max_depth=500, random_state=1212)
rf_clf.fit(X_train, y_train)
RandomForestClassifier(max_depth=500, n_estimators=200, random_state=1212)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Overall accuracy on the held-out rows. The classes are imbalanced ("Poor" is
# only 125 of 3000 test rows), so read it together with the per-class report.
y_pred = rf_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.9016666666666666
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

     Average       0.82      0.83      0.83       847
        Good       0.93      0.96      0.95      2028
        Poor       0.96      0.42      0.59       125

    accuracy                           0.90      3000
   macro avg       0.91      0.74      0.79      3000
weighted avg       0.90      0.90      0.90      3000
# Rows are true classes, columns are predicted classes, both in sorted label
# order (Average, Good, Poor); row sums match the supports in the report above.
print(confusion_matrix(y_test, y_pred))
[[ 704  141    2]
 [  80 1948    0]
 [  72    0   53]]
# Random-forest importances, one value per column in feature_cols order.
rf_clf.feature_importances_
array([0.05958089, 0.14695664, 0.05456092, 0.39132416, 0.06662972,
       0.0553765 , 0.05974965, 0.06450309, 0.10131843])
# Ascending importances. Sorting the bare array drops the pairing with feature
# names, so match values back against the unsorted output above.
sorted(rf_clf.feature_importances_)
[0.054560915836208625,
 0.055376503787369434,
 0.05958089210684196,
 0.05974965275725182,
 0.06450308658146583,
 0.06662971724764624,
 0.10131843171547394,
 0.14695663514721352,
 0.3913241648205286]
# Ground-truth labels for the held-out rows. y_test.index holds index *labels*,
# so select with label-based .loc; positional .iloc only gave the right rows
# because df has a default RangeIndex where label == position.
df.loc[y_test.index, 'Health_Status']
356        Good
8088       Good
2634       Good
9979    Average
6928    Average
         ...   
6364       Good
5786    Average
7395    Average
4825       Good
3079    Average
Name: Health_Status, Length: 3000, dtype: object
# Encode the string labels as integers so matplotlib can colour by class.
le = LabelEncoder()
le.fit(y_test)
y_test_num = le.transform(y_test)

# The two features with the highest random-forest importance, coloured by the
# true Health_Status.
plt.figure(figsize=(10, 6))
plt.scatter(
    X_test['Mindfulness'],
    X_test['Nutrition_Score'],
    c=y_test_num,
    alpha=0.6,
)
plt.xlabel('Mindfulness')
plt.ylabel('Nutrition Score')
plt.title('Mindfulness and Nutrition Score with Health Status')
Text(0.5, 1.0, 'Mindfulness and Nutrition Score with Health Status')

# Interactive Mindfulness vs Nutrition_Score scatter, coloured by true label.
scatter_layout = {'width': 1000, 'height': 600, 'template': 'seaborn'}
fig = px.scatter(
    X_test,
    x='Mindfulness',
    y='Nutrition_Score',
    color=y_test,
    opacity=0.6,
    labels={'color': 'Health Status'},
    title='Mindfulness and Nutrition Score with Health Status',
    **scatter_layout,
)
fig
# Reuse the encoder already fitted on y_test instead of refitting it on the
# predictions: every class then keeps the same integer code (and colour) as in
# the ground-truth plot, even if the model never predicts one of the classes.
y_pred_num = le.transform(y_pred)
plt.figure(figsize=(10,6))
plt.scatter(X_test.Mindfulness,X_test.Nutrition_Score,c=y_pred_num,alpha= 0.6)
plt.xlabel('Mindfulness')
plt.ylabel('Nutrition Score')
plt.title('RandomForest Classifiaction for Mindfulness and Nutrition Score with Health Status')
Text(0.5, 1.0, 'RandomForest Classifiaction for Mindfulness and Nutrition Score with Health Status')

# Interactive Mindfulness vs Nutrition_Score scatter, coloured by the random
# forest's predicted label.
pred_layout = {'width': 1000, 'height': 600, 'template': 'seaborn'}
fig = px.scatter(
    X_test,
    x='Mindfulness',
    y='Nutrition_Score',
    color=y_pred,
    opacity=0.6,
    labels={'color': 'Predicted Health Status'},
    title='Random Forest for Mindfulness and Nutrition Score with Health Status',
    **pred_layout,
)
fig
  • LogisticRegression
# The features live on very different scales (e.g. minutes of activity vs.
# litres of water), and on the raw data lbfgs stopped at max_iter without
# converging. Standardising inside a pipeline addresses that, and the scaler is
# fitted on the training split only, so nothing leaks from the test rows.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

log_clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
log_clf.fit(X_train, y_train)
/home/csy/anaconda3/envs/temp_csy/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:460: ConvergenceWarning:

lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression(max_iter=1000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# From here on y_pred holds the logistic model's predictions (it overwrites
# the random-forest predictions stored under the same name).
y_pred = log_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
0.995
# Per-class precision / recall / F1 for the logistic model.
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

     Average       1.00      0.99      0.99       847
        Good       1.00      1.00      1.00      2028
        Poor       0.98      0.98      0.98       125

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000
# Rows are true classes, columns are predicted classes, both in sorted label
# order (Average, Good, Poor).
print(confusion_matrix(y_test, y_pred))
[[ 836    9    2]
 [   1 2027    0]
 [   3    0  122]]
# NOTE(review): this still inspects rf_clf (the random forest), so it repeats
# the values shown earlier and says nothing about the logistic model fitted in
# this section.
rf_clf.feature_importances_
array([0.05958089, 0.14695664, 0.05456092, 0.39132416, 0.06662972,
       0.0553765 , 0.05974965, 0.06450309, 0.10131843])
# NOTE(review): also the random forest's importances (sorted ascending), not a
# property of the logistic model.
sorted(rf_clf.feature_importances_)
[0.054560915836208625,
 0.055376503787369434,
 0.05958089210684196,
 0.05974965275725182,
 0.06450308658146583,
 0.06662971724764624,
 0.10131843171547394,
 0.14695663514721352,
 0.3913241648205286]
# Encode with the already-fitted encoder rather than refitting it on this
# model's predictions, so the class -> integer (and colour) mapping cannot
# shift between plots if some class is never predicted.
y_pred_num = le.transform(y_pred)
plt.figure(figsize=(10,6))
plt.scatter(X_test.Mindfulness,X_test.Nutrition_Score,c=y_pred_num,alpha= 0.6)
plt.xlabel('Mindfulness')
plt.ylabel('Nutrition Score')
# y_pred holds the logistic-regression predictions here, so label the plot
# accordingly (the title was carried over from the random-forest cell).
plt.title('LogisticRegression Classification for Mindfulness and Nutrition Score with Health Status')
Text(0.5, 1.0, 'RandomForest Classifiaction for Mindfulness and Nutrition Score with Health Status')

# Interactive scatter coloured by the logistic-regression predictions; y_pred
# was reassigned from log_clf above, so the title names that model rather than
# the random forest.
fig = px.scatter(
    X_test,
    x='Mindfulness',
    y='Nutrition_Score',
    color=y_pred,
    opacity=0.6,
    labels={'color': 'Predicted Health Status'},
    title='Logistic Regression for Mindfulness and Nutrition Score with Health Status',
    width=1000,
    height=600,
    template='seaborn'
)
fig

Dashboard

# Class counts with explicit column names. The names produced by a bare
# value_counts().reset_index() differ between pandas 1.x ("index" /
# "Health_Status") and 2.x ("Health_Status" / "count"), so naming them here
# keeps the chart working on both.
status_counts = (
    df["Health_Status"]
    .value_counts()
    .rename_axis("Health_Status")
    .reset_index(name="Count")
)
fig = px.bar(
    status_counts,
    x="Health_Status",
    y="Count",
    labels={"Health_Status": ""},  # keep the x-axis title blank
    title="Health Status",
    width=1000,
    height=400,
    template='seaborn'
)
fig.show()
# Health_Status is a string column; restrict the correlation to numeric
# columns explicitly instead of relying on the deprecated implicit default
# (which raises on pandas >= 2.0).
df.corr(numeric_only=True)
FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
Physical_Activity Nutrition_Score Stress_Level Mindfulness Sleep_Hours Hydration BMI Alcohol Smoking Overall_Health_Score
Physical_Activity 1.000000 -0.008942 0.023377 -0.011513 -0.009958 -0.021959 -0.016279 -0.021592 0.004774 0.146014
Nutrition_Score -0.008942 1.000000 0.027171 0.014820 -0.003984 0.000604 -0.009665 0.001983 -0.003347 0.385678
Stress_Level 0.023377 0.027171 1.000000 0.001435 0.008659 -0.004224 -0.000406 -0.013165 -0.001170 -0.129315
Mindfulness -0.011513 0.014820 0.001435 1.000000 0.008517 -0.008026 0.014029 0.017807 0.005669 0.715856
Sleep_Hours -0.009958 -0.003984 0.008659 0.008517 1.000000 -0.010928 0.008973 0.002004 0.001971 0.171856
Hydration -0.021959 0.000604 -0.004224 -0.008026 -0.010928 1.000000 -0.002795 0.013731 0.013585 0.142044
BMI -0.016279 -0.009665 -0.000406 0.014029 0.008973 -0.002795 1.000000 0.005597 -0.001867 -0.160278
Alcohol -0.021592 0.001983 -0.013165 0.017807 0.002004 0.013731 0.005597 1.000000 -0.023092 -0.174725
Smoking 0.004774 -0.003347 -0.001170 0.005669 0.001971 0.013585 -0.001867 -0.023092 1.000000 -0.294431
Overall_Health_Score 0.146014 0.385678 -0.129315 0.715856 0.171856 0.142044 -0.160278 -0.174725 -0.294431 1.000000
# x_col / y_col are assigned here because nothing earlier in the notebook
# defines them; using the same names for the axes and the title keeps the two
# in sync.
x_col, y_col = 'Smoking', 'Hydration'
fig = px.scatter(
    df,
    x=x_col,
    y=y_col,
    color="Health_Status",
    title=f"{x_col} vs {y_col}",
    hover_data=["Overall_Health_Score"],
    width=1000,
    height=600,
    template='seaborn'
)
fig.show()
# x_col / y_col are assigned here because nothing earlier in the notebook
# defines them; using the same names for the axes and the title keeps the two
# in sync.
x_col, y_col = 'Alcohol', 'Hydration'
fig = px.scatter(
    df,
    x=x_col,
    y=y_col,
    color="Health_Status",
    title=f"{x_col} vs {y_col}",
    hover_data=["Overall_Health_Score"],
    width=1000,
    height=600,
    template='seaborn'
)
fig.show()
# Distribution of the overall score, split by health-status class.
hist_layout = {'width': 1000, 'height': 400, 'template': 'seaborn'}
fig = px.histogram(
    df,
    x="Overall_Health_Score",
    color="Health_Status",
    nbins=30,
    title="Overall Health Score",
    **hist_layout,
)
fig.show()
# Mindfulness minutes per health-status class. The title names the plotted
# column (it previously said "Sleep Hours", which is not what y shows).
fig = px.box(
    df,
    x="Health_Status",
    y="Mindfulness",
    color="Health_Status",
    title="Mindfulness by Health Status",
    width=1000,
    height=400,
    template='seaborn'
)
fig.show()
# Nutrition score per health-status class. The title names the plotted column
# (it previously said "Sleep Hours", which is not what y shows).
fig = px.box(
    df,
    x="Health_Status",
    y="Nutrition_Score",
    color="Health_Status",
    title="Nutrition Score by Health Status",
    width=1000,
    height=400,
    template='seaborn'
)
fig.show()
# Cigarettes per day per health-status class. The title names the plotted
# column (it previously said "Sleep Hours", which is not what y shows).
fig = px.box(
    df,
    x="Health_Status",
    y="Smoking",
    color="Health_Status",
    title="Smoking by Health Status",
    width=1000,
    height=400,
    template='seaborn'
)
fig.show()