Reference

data

Import

import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px

Data

# Read the perfume CSV and drop every row whose target_audience cell is the
# literal text 'Target Audience' (presumably repeated header rows - confirm).
df = (
    pd.read_csv('../../../delete/Perfumes_dataset.csv')
    .query("target_audience!='Target Audience'")
)

category가 Fruity Sweet Gourmand 이고 target_audience가 unisex임

# Manually overwrite two fields of the row labelled 203.
for column_name, corrected_value in (
    ("category", "Fruity Sweet Gourmand"),
    ("target_audience", "Unisex"),
):
    df.at[203, column_name] = corrected_value

성별 통일

# Collapse the audience spellings into the three labels Female / Male / Unisex.
mapping = {
    **dict.fromkeys(("Female", "Women"), "Female"),
    **dict.fromkeys(("Male", "Men"), "Male"),
    "Unisex": "Unisex",
}

df["target_audience"] = df["target_audience"].map(mapping)

category 단어 구별

# Split each category on single spaces and store the first three words in
# cat1..cat3 (one column per word position).
split_cols = df["category"].str.split(" ", expand=True)
for word_position, column_name in enumerate(("cat1", "cat2", "cat3")):
    df[column_name] = split_cols[word_position]

longevity 통일

# Canonical longevity labels, keyed by the whitespace-stripped raw label.
# Stripping before the lookup means trailing-space variants no longer have to
# be listed one by one, and labels such as "Medium–Strong" (no trailing space)
# are no longer silently mapped to NaN.
mapping = {
    "Light": "Light",
    "Light–Medium": "Light–Medium",
    "6–8 hours": "Light–Medium",
    "Medium": "Medium",
    "Medium–Strong": "Medium–Strong",
    "Strong": "Strong",
    "Very Strong": "Very Strong",
}

# Keep only the text before the first ':' and trim surrounding whitespace.
df['longevity_level'] = df["longevity"].str.split(":", expand=True)[0].str.strip()

# Labels missing from the mapping are left as NaN by Series.map.
df["longevity_level"] = df["longevity_level"].map(mapping)
df.head()

	brand	perfume	type	category	target_audience	longevity	cat1	cat2	cat3	longevity_level
0	dumont	nitro red	edp	Fresh Scent	Male	Strong	Fresh	Scent	None	Strong
1	dumont	nitro pour homme	edp	Fresh Scent	Male	Strong	Fresh	Scent	None	Strong
2	dumont	nitro white	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong
3	dumont	nitro blue	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong
4	dumont	nitro green	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong

type 통일

# List the distinct raw values of the "type" column.
df["type"].unique()

array(['edp', 'edt', 'parfum', 'EDP', 'Extrait de Parfum', 'Parfum',
       'EDT', 'Extrait', 'Cologne', 'Alcohol-free', 'Attar',
       'Concentrate', 'Oil'], dtype=object)

| Raw 값 예시 | 표준화 값 | 설명 |
| --- | --- | --- |
| `edp`, `EDP` | EDP (Eau de Parfum) | 가장 흔한 형태 (15–20%, 5–8시간 지속) |
| `edt`, `EDT` | EDT (Eau de Toilette) | 가볍고 산뜻 (5–15%, 3–5시간) |
| `parfum`, `Parfum` | Parfum | 진하고 오래감 (20–30%, 8–12시간) |
| `Extrait de Parfum`, `Extrait` | Extrait | 최고 농도 (30% 이상, 12시간+) |
| `Cologne` | EDC (Eau de Cologne) | 가볍고 빠르게 사라짐 (2–5%, 1–3시간) |
| `Alcohol-free`, `Oil`, `Attar`, `Concentrate` | Oil/Attar | 알코올 없는 오일 기반 향수, 지속력 강하지만 확산력은 약할 수 있음 |

# Raw "type" spellings grouped under the standardized label they collapse to.
mapping = {
    **dict.fromkeys(("edp", "EDP"), "EDP"),
    **dict.fromkeys(("edt", "EDT"), "EDT"),
    **dict.fromkeys(("parfum", "Parfum"), "Parfum"),
    **dict.fromkeys(("Extrait de Parfum", "Extrait"), "Extrait"),
    "Cologne": "EDC",
    **dict.fromkeys(("Alcohol-free", "Attar", "Concentrate", "Oil"), "Oil/Attar"),
}

# Store the standardized label in a new column; the raw "type" column is kept.
df["type_standardized"] = df["type"].map(mapping)
df.head()

	brand	perfume	type	category	target_audience	longevity	cat1	cat2	cat3	longevity_level	type_standardized
0	dumont	nitro red	edp	Fresh Scent	Male	Strong	Fresh	Scent	None	Strong	EDP
1	dumont	nitro pour homme	edp	Fresh Scent	Male	Strong	Fresh	Scent	None	Strong	EDP
2	dumont	nitro white	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong	EDP
3	dumont	nitro blue	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong	EDP
4	dumont	nitro green	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong	EDP

brand pie chart를 plotly로 만들자
target_audience pie chart를 plotly로 만들자
category별 target 구분
type별 longevity구분

Brand 상위 20 Ratio

개수 카운트

# Number of rows per brand.
brand_counts = df.brand.value_counts()
brand_counts.head(n=5)

Jean Paul Gaultier    94
paris corner          77
armaf                 70
Al Haramain           43
fragrance world       42
Name: brand, dtype: int64

상위 20개

# Labels of the 20 brands with the highest row counts.
top20_brands = brand_counts.nlargest(n=20).index
top20_brands

Index(['Jean Paul Gaultier', 'paris corner', 'armaf', 'Al Haramain',
       'fragrance world', 'Lattafa', 'Azzaro', 'Hugo Boss', 'Giorgio Armani',
       'Afnan', 'Dior', 'Ajmal', 'Hermès', 'Prada', 'Maison Alhambra',
       'Louis Vuitton', 'Creed', 'Victoria's Secret', 'Carolina Herrera',
       'Dolce & Gabbana'],
      dtype='object')

재그룹화된 열 합치기

# Keep the brand name for top-20 brands and relabel every other row "Others".
is_top20_brand = df["brand"].isin(top20_brands)
df["brand_grouped"] = df["brand"].where(is_top20_brand, other="Others")
df.head()

	brand	perfume	type	category	target_audience	longevity	cat1	cat2	cat3	longevity_level	type_standardized	brand_grouped
0	dumont	nitro red	edp	Fresh Scent	Male	Strong	Fresh	Scent	None	Strong	EDP	Others
1	dumont	nitro pour homme	edp	Fresh Scent	Male	Strong	Fresh	Scent	None	Strong	EDP	Others
2	dumont	nitro white	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong	EDP	Others
3	dumont	nitro blue	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong	EDP	Others
4	dumont	nitro green	edp	Fresh Scent	Unisex	Strong	Fresh	Scent	None	Strong	EDP	Others

pie chart 만들기 위한 데이터셋

# Two-column frame (brand, count) that feeds the pie charts.
brand_grouped_counts = (
    df["brand_grouped"]
    .value_counts()
    .reset_index()
)
brand_grouped_counts.columns = ["brand", "count"]

plotly

# Interactive plotly pie: one slice per grouped brand, sized by row count.
fig = px.pie(
    data_frame=brand_grouped_counts,
    values="count",
    names="brand",
    title="Top 20 Brands (Others grouped)",
    template='seaborn',
    height=600,
    width=600,
)
fig.show()

matplotlib

# Static matplotlib version of the grouped-brand pie chart.
sizes = brand_grouped_counts["count"]
labels = brand_grouped_counts["brand"]

fig, ax = plt.subplots(figsize=(8, 8))
wedges, texts, autotexts = ax.pie(
    sizes,
    labels=labels,
    textprops={"color": "black"},
    startangle=90,       # start angle in degrees
    autopct="%1.1f%%",   # print each slice's share as a percentage
)
ax.set_title("Top 20 Brands (Others grouped)", fontsize=14)

fig.tight_layout()
plt.show()

브랜드는 고르게 분포

성별 비율

# Two-column frame (target_audience, count) that feeds the pie charts.
target_audience_counts = (
    df["target_audience"]
    .value_counts()
    .reset_index()
)
target_audience_counts.columns = ["target_audience", "count"]
# Disabled filter that would drop audiences occurring exactly once:
# target_audience_counts = target_audience_counts.query('count!=1')

plotly

# Interactive plotly pie: one slice per target audience, sized by row count.
fig = px.pie(
    data_frame=target_audience_counts,
    values="count",
    names="target_audience",
    title="target_audience",
    template='seaborn',
    height=600,
    width=600,
)
fig.show()

matplotlib

# Static matplotlib version of the target-audience pie chart.
sizes = target_audience_counts["count"]
labels = target_audience_counts["target_audience"]

fig, ax = plt.subplots(figsize=(6, 6))
wedges, texts, autotexts = ax.pie(
    sizes,
    labels=labels,
    textprops={"color": "black"},
    startangle=90,       # start angle in degrees
    autopct="%1.1f%%",   # print each slice's share as a percentage
)
ax.set_title("Target Audience", fontsize=14)

fig.tight_layout()
plt.show()

비율 동일

category 별 타겟 구분

1st

# Ten most frequent first category words.
df.cat1.value_counts().nlargest(n=10)

Woody         260
Floriental    136
Oriental      114
Amber          96
Floral         77
Fresh          61
Aromatic       38
Unknown        38
Citrus         33
Fruity         33
Name: cat1, dtype: int64

상위 5개

# Labels of the five most frequent first category words.
top5_cat1 = df.cat1.value_counts().nlargest(n=5).index
top5_cat1

Index(['Woody', 'Floriental', 'Oriental', 'Amber', 'Floral'], dtype='object')

데이터셋에서 그 5개에 해당하는 것만 가져오기

# Keep only the rows whose first category word is one of the top five.
in_top5_cat1 = df["cat1"].isin(top5_cat1)
df_top5 = df[in_top5_cat1]

집계

# Row count for every (target_audience, cat1) pair present in the subset.
counts = (
    df_top5.groupby(["target_audience", "cat1"])
    .size()
    .reset_index(name="count")
)

bar chart
plotly

# Grouped plotly bars: audiences on the x axis, one bar per cat1 value.
fig = px.bar(
    data_frame=counts,
    x="target_audience",
    y="count",
    color="cat1",
    barmode="group",
    title="Top 5 category(1st) by Target Audience",
    template='seaborn',
    height=600,
    width=1000,
)
fig.show()

matplotlib

# Wide table: one row per audience, one column per cat1 value; pairs with no
# rows are filled with 0.
pivot_df = counts.pivot(
    index="target_audience",
    columns="cat1",
    values="count",
).fillna(0)

# Grouped bar chart drawn through the pandas plotting wrapper.
ax = pivot_df.plot(kind="bar", width=0.8, figsize=(10, 6))

ax.set_title("Top 5 category(1st) by Target Audience", fontsize=14)
ax.set_xlabel("Target Audience")
ax.set_ylabel("Count")
ax.legend(title="cat1")
plt.xticks(rotation=0)  # keep the x tick labels horizontal
plt.tight_layout()
plt.show()

남성은 woody향을 내세우고
여성은 floriental 향을 내세우는군
두 개는 성별로 약간 극단적

2nd

# Ten most frequent second category words.
df.cat2.value_counts().nlargest(n=10)

Spicy       200
Floral      174
Aromatic     94
Fruity       78
Woody        44
Fougere      34
Scent        25
Oriental     25
Aquatic      20
Gourmand     16
Name: cat2, dtype: int64

상위 5개

# Labels of the five most frequent second category words.
top5_cat2 = df.cat2.value_counts().nlargest(n=5).index
top5_cat2

Index(['Spicy', 'Floral', 'Aromatic', 'Fruity', 'Woody'], dtype='object')

데이터셋에서 그 5개에 해당하는 것만 가져오기

# Keep only the rows whose second category word is one of the top five.
in_top5_cat2 = df["cat2"].isin(top5_cat2)
df_top5 = df[in_top5_cat2]

집계

# Row count for every (target_audience, cat2) pair present in the subset.
counts = (
    df_top5.groupby(["target_audience", "cat2"])
    .size()
    .reset_index(name="count")
)

bar chart
plotly

# Grouped plotly bars: audiences on the x axis, one bar per cat2 value.
fig = px.bar(
    data_frame=counts,
    x="target_audience",
    y="count",
    color="cat2",
    barmode="group",
    title="Top 5 category(2nd) by Target Audience",
    template='seaborn',
    height=600,
    width=1000,
)
fig.show()

matplotlib

# Wide table: one row per audience, one column per cat2 value; pairs with no
# rows are filled with 0.
pivot_df = counts.pivot(
    index="target_audience",
    columns="cat2",
    values="count",
).fillna(0)

# Grouped bar chart drawn through the pandas plotting wrapper.
ax = pivot_df.plot(kind="bar", width=0.8, figsize=(10, 6))

ax.set_title("Top 5 category(2nd) by Target Audience", fontsize=14)
ax.set_xlabel("Target Audience")
ax.set_ylabel("Count")
ax.legend(title="cat2")
plt.xticks(rotation=0)  # keep the x tick labels horizontal
plt.tight_layout()
plt.show()

남성은 spicy향 아주 내세우고
여성은 floral향
이것도 성별로 극단적

type 별 구분

# Number of rows per standardized type.
df["type_standardized"].value_counts()

EDP          764
EDT          161
Parfum        39
Extrait       21
EDC           11
Oil/Attar      7
Name: type_standardized, dtype: int64

# Longevity levels in the order used for stacking and legends.
order = [
    "Light",
    "Light–Medium",
    "Medium",
    "Medium–Strong",
    "Strong",
    "Very Strong",
]

# Row count for every (type_standardized, longevity_level) pair.
by_type = (
    df.groupby(["type_standardized", "longevity_level"])
    .size()
    .reset_index(name="count")
)

# Types ordered by their total row count, largest first.
type_order = (
    by_type.groupby("type_standardized")["count"]
    .sum()
    .sort_values(ascending=False)
    .index
)

plotly

# Fixed colour per longevity level so every chart uses the same palette.
custom_colors = {
    "Light": "#a6cee3",
    "Light–Medium": "#1f78b4",
    "Medium": "#33a02c",
    "Medium–Strong": "#fb9a99",
    "Strong": "#e31a1c",
    "Very Strong": "#6a3d9a",
}

# Stacked plotly bars: one bar per type, segments coloured by longevity level.
fig = px.bar(
    data_frame=by_type,
    x="type_standardized",
    y="count",
    color="longevity_level",
    color_discrete_map=custom_colors,  # explicit level -> colour mapping
    category_orders={
        "type_standardized": list(type_order),
        "longevity_level": order,
    },
    barmode="stack",
    title="Longevity Levels by Type (counts)",
    template='seaborn',
    height=800,
    width=1000,
)

fig.show()

matplotlib

# Matplotlib version of the stacked "longevity by type" chart. This cell
# previously repeated the plotly figure even though it sits in the matplotlib
# section; it now mirrors the other matplotlib cells.

# Fixed colour per longevity level; the key order also fixes the stack order.
custom_colors = {
    "Light": "#a6cee3",
    "Light–Medium": "#1f78b4",
    "Medium": "#33a02c",
    "Medium–Strong": "#fb9a99",
    "Strong": "#e31a1c",
    "Very Strong": "#6a3d9a"
}

# Wide table: one row per type (largest total first), one column per level.
# Combinations that never occur are filled with 0 so they add no height.
pivot_df = (
    by_type.pivot(index="type_standardized", columns="longevity_level", values="count")
    .reindex(index=list(type_order), columns=list(custom_colors))
    .fillna(0)
)

fig, ax = plt.subplots(figsize=(10, 8))
pivot_df.plot(
    kind="bar",
    stacked=True,
    color=[custom_colors[level] for level in pivot_df.columns],
    width=0.8,
    ax=ax
)

ax.set_title("Longevity Levels by Type (counts)", fontsize=14)
ax.set_xlabel("type_standardized")
ax.set_ylabel("count")
# Legend is anchored outside the axes so it does not cover the bars.
ax.legend(title="longevity_level", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=0)  # keep the x tick labels horizontal
plt.tight_layout()
plt.show()

| Raw 값 예시 | 표준화 값 | 설명 |
| --- | --- | --- |
| `edp`, `EDP` | EDP (Eau de Parfum) | 가장 흔한 형태 (15–20%, 5–8시간 지속) |
| `edt`, `EDT` | EDT (Eau de Toilette) | 가볍고 산뜻 (5–15%, 3–5시간) |
| `parfum`, `Parfum` | Parfum | 진하고 오래감 (20–30%, 8–12시간) |
| `Extrait de Parfum`, `Extrait` | Extrait | 최고 농도 (30% 이상, 12시간+) |
| `Cologne` | EDC (Eau de Cologne) | 가볍고 빠르게 사라짐 (2–5%, 1–3시간) |
| `Alcohol-free`, `Oil`, `Attar`, `Concentrate` | Oil/Attar | 알코올 없는 오일 기반 향수, 지속력 강하지만 확산력은 약할 수 있음 |

EDP만
plotly

# Plotly pie of longevity levels restricted to the EDP rows of by_type.
fig = px.pie(
    data_frame=by_type.query("type_standardized == 'EDP'"),
    values="count",
    names="longevity_level",
    color="longevity_level",              # column that drives the slice colour
    color_discrete_map=custom_colors,     # level -> colour mapping
    title="EDP",
    template='seaborn',
    height=600,
    width=600,
)
fig.show()

matplotlib

# Matplotlib version of the EDP longevity pie. This cell previously repeated
# the plotly figure even though it sits in the matplotlib section.

# Counts per longevity level for EDP rows only, indexed by level.
edp_counts = by_type.query("type_standardized == 'EDP'").set_index("longevity_level")["count"]

# Keep the palette's level order and skip levels that have no EDP rows, so no
# empty slices or stray labels are drawn.
edp_levels = [level for level in custom_colors if level in edp_counts.index]

fig, ax = plt.subplots(figsize=(6, 6))
wedges, texts, autotexts = ax.pie(
    edp_counts.loc[edp_levels],
    labels=edp_levels,
    colors=[custom_colors[level] for level in edp_levels],
    autopct="%1.1f%%",   # print each slice's share as a percentage
    startangle=90,       # start angle in degrees
    textprops={"color": "black"}
)

ax.set_title("EDP", fontsize=14)
plt.tight_layout()
plt.show()

edp는 medium이 많고 그다음 strong

성별 longevity level

# Number of rows per target audience.
df["target_audience"].value_counts()

Unisex    376
Female    331
Male      296
Name: target_audience, dtype: int64

plotly

# 집계
by_aud = df.groupby(["target_audience","longevity_level"]).size().reset_index(name="count")

# 각 target_audience별 총합
by_aud["percent"] = by_aud.groupby("target_audience")["count"].transform(lambda x: x / x.sum() * 100)

# bar chart (percent 사용)
fig = px.bar(
    by_aud,
    x="target_audience",
    y="percent",
    color="longevity_level",
    barmode="stack",
    category_orders={"longevity_level": ["Light","Light–Medium","Medium","Medium–Strong","Strong","Very Strong"]},
    title="Longevity distribution by Target Audience (%)",
    width=800, height=600,
    template='seaborn',
    color_discrete_map=custom_colors 
)
fig.show()

matplotlib

# Wide table: one row per audience, one column per longevity level (percent);
# pairs with no rows are filled with 0.
pivot_df = by_aud.pivot(
    index="target_audience",
    columns="longevity_level",
    values="percent",
).fillna(0)

# Colour per longevity level, then one colour per pivot column in column order.
custom_colors = {
    "Light": "#a6cee3",
    "Light–Medium": "#1f78b4",
    "Medium": "#33a02c",
    "Medium–Strong": "#fb9a99",
    "Strong": "#e31a1c",
    "Very Strong": "#6a3d9a",
}
colors = [custom_colors[level] for level in pivot_df.columns]

# Stacked bar chart drawn through the pandas plotting wrapper.
ax = pivot_df.plot(
    kind="bar",
    stacked=True,
    width=0.8,
    figsize=(10, 6),
    color=colors,
)

ax.set_title("Longevity distribution by Target Audience (%)", fontsize=14)
ax.set_xlabel("Target Audience")
ax.set_ylabel("Percent")
# Legend is anchored outside the axes, to the right of the plot.
ax.legend(title="Longevity Level", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()