Barplot + 해들리위컴의 그래프레이어

import pandas as pd 
import numpy as np 
from plotnine import * 
import matplotlib.pyplot as plt

기본사용법

# Toy data: 100 rows labelled 'A' (standard normal scaled by 2, shifted by 2)
# followed by 200 rows labelled 'B' (standard normal shifted by 3).
g = ['A'] * 100 + ['B'] * 200
y = [*(np.random.randn(100) * 2 + 2), *(np.random.randn(200) + 3)]
df = pd.DataFrame(dict(g=g, y=y))
df

ggplot(df)+geom_bar(aes(x='g',fill='g')) ## counts the rows per x value by default

<ggplot: (8727603348462)>

- 이것은 아래의 코드와 같다.

# The same per-group row count, computed by hand with pandas.
df.groupby(by='g').count()

fig=ggplot(df.groupby(by='g').count().reset_index()) # aggregate with pandas first, then hand the table to ggplot
fig+geom_bar(aes(x='g',y='y',fill='g'),stat='identity') # stat='identity' means no transformation is applied

<ggplot: (8727602748669)>

- barplot은 기본적으로 groupby+count()가 내장되어 있다. 따라서 아래의 코드

ggplot(df)+geom_bar(aes(x='g',fill='g')) ## counts the rows per x value by default

를 좀더 엄밀하게 쓰면

# Same plot with the default statistic written out explicitly.
ggplot(df)+geom_bar(aes(x='g',fill='g'),stat='count')

<ggplot: (8727602720669)>

- 이것은 때때로 불편하다. 왜냐하면 데이터프레임을 변환하는 것은 판다스를 이용하는 것이 더 쉽고 자유롭기 때문이다.

barplot의 불편한점1

# Pre-aggregated table: one row per group, with the row count stored in column 'y'.
td=df.groupby(by='g').count().reset_index()
td

- 그냥 'x=g, y=y'를 맵핑하여 그리면 안되나?

# Plain matplotlib plots the pre-aggregated values as they are.
plt.bar(td.g,td.y)

<BarContainer object of 2 artists>

# The pandas plotting wrapper accepts the same x/y mapping.
td.plot(kind='bar',x='g',y='y')

<AxesSubplot:xlabel='g'>

- 그런데 ggplot을 쓰려고 하면?

# Intentionally failing example: geom_bar defaults to stat='count', which
# rejects a y aesthetic and raises PlotnineError when the plot is rendered.
ggplot(td)+geom_bar(aes(x='g',y='y',fill='g'))

---------------------------------------------------------------------------
PlotnineError                             Traceback (most recent call last)
~/anaconda3/envs/csy/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

~/anaconda3/envs/csy/lib/python3.8/site-packages/IPython/lib/pretty.py in pretty(self, obj)
    392                         if cls is not object \
    393                                 and callable(cls.__dict__.get('__repr__')):
--> 394                             return _repr_pprint(obj, self, cycle)
    395 
    396             return _default_pprint(obj, self, cycle)

~/anaconda3/envs/csy/lib/python3.8/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    698     """A pprint that just redirects to the normal repr function."""
    699     # Find newlines and replace them with p.break_()
--> 700     output = repr(obj)
    701     lines = output.splitlines()
    702     with p.group():

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/ggplot.py in __repr__(self)
     95         Print/show the plot
     96         """
---> 97         self.__str__()
     98         return '<ggplot: (%d)>' % self.__hash__()
     99 

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/ggplot.py in __str__(self)
     86         Print/show the plot
     87         """
---> 88         self.draw(show=True)
     89 
     90         # Return and empty string so that print(p) is "pretty"

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/ggplot.py in draw(self, return_ggplot, show)
    203         self = deepcopy(self)
    204         with plot_context(self, show=show):
--> 205             self._build()
    206 
    207             # setup

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/ggplot.py in _build(self)
    298 
    299         # Apply and map statistics
--> 300         layers.compute_statistic(layout)
    301         layers.map_statistic(self)
    302 

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
     71     def compute_statistic(self, layout):
     72         for l in self:
---> 73             l.compute_statistic(layout)
     74 
     75     def map_statistic(self, plot):

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
    322         data = self.stat.use_defaults(data)
    323         data = self.stat.setup_data(data)
--> 324         data = self.stat.compute_layer(data, params, layout)
    325         self.data = data
    326 

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_layer(cls, data, params, layout)
    274             return cls.compute_panel(pdata, pscales, **params)
    275 
--> 276         return groupby_apply(data, 'PANEL', fn)
    277 
    278     @classmethod

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/utils.py in groupby_apply(df, cols, func, *args, **kwargs)
    632         # function fn should be free to modify dataframe d, therefore
    633         # do not mark d as a slice of df i.e no SettingWithCopyWarning
--> 634         lst.append(func(d, *args, **kwargs))
    635     return pd.concat(lst, axis=axis, ignore_index=True)
    636 

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/stats/stat.py in fn(pdata)
    272                 return pdata
    273             pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 274             return cls.compute_panel(pdata, pscales, **params)
    275 
    276         return groupby_apply(data, 'PANEL', fn)

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_panel(cls, data, scales, **params)
    305         stats = []
    306         for _, old in data.groupby('group'):
--> 307             new = cls.compute_group(old, scales, **params)
    308             unique = uniquecols(old)
    309             missing = unique.columns.difference(new.columns)

~/anaconda3/envs/csy/lib/python3.8/site-packages/plotnine/stats/stat_count.py in compute_group(cls, data, scales, **params)
     51         if ('y' in data) or ('y' in params):
     52             msg = 'stat_count() must not be used with a y aesthetic'
---> 53             raise PlotnineError(msg)
     54 
     55         weight = data.get('weight', np.ones(len(x), dtype=int))

PlotnineError: 'stat_count() must not be used with a y aesthetic'

# stat='identity' plots the y values as given, so the pre-aggregated table works.
ggplot(td)+geom_bar(aes(x='g',y='y',fill='g'),stat='identity')

<ggplot: (8727602574924)>

너무 불편해요.. stat='identity' 를 항상 써야하는것이!

barplot의 불편한점2

- groupby 를 자동으로 해주므로 익숙해지면 ggplot2 방식이 더 편하지 않을까? $\to$ groupby 하는게 더 편해요..

# Per-group summary of y.  The built-in statistics are passed by name
# ('mean', 'median', 'std') instead of as NumPy callables: pandas deprecated
# the callable form in groupby aggregation, and naming 'std' keeps pandas'
# own standard deviation rather than whatever np.std would compute directly.
df.groupby('g').agg({'y': ['mean', 'median', 'std', lambda x: np.max(x) - np.min(x)]})

# The anonymous aggregation is labelled '<lambda_0>'; rename it to 'range',
# then move the statistic level from the columns into the rows.
(
    df.groupby('g')
    .agg({'y': ['mean', 'median', 'std', lambda x: np.max(x) - np.min(x)]})
    .rename(columns={'<lambda_0>': 'range'})
    .stack()
    .reset_index()
)

# Keep the long table for plotting: columns g, level_1 (statistic name) and y.
td = (
    df.groupby('g')
    .agg({'y': ['mean', 'median', 'std', lambda x: np.max(x) - np.min(x)]})
    .rename(columns={'<lambda_0>': 'range'})
    .stack()
    .reset_index()
)

# One bar per statistic; without a position argument the two groups are stacked.
ggplot(td)+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity')

<ggplot: (8727602602225)>

쌓인상태로 보이는것이 불편함. $\to$ position='dodge' 로!

position

# position='dodge' places the groups side by side instead of stacking them.
ggplot(td)+geom_bar(aes(x='level_1',y='y',fill='g'),stat='identity',position='dodge')

<ggplot: (8727602511454)>

coord_flip()

- 때때로 아래와 같이 보는 것이 더 좋은 경우도 있음

# Dodged bar chart with the axes flipped.
(
    ggplot(td)
    + geom_bar(aes(x='level_1', y='y', fill='g'), stat='identity', position='dodge')
    + coord_flip()
)

<ggplot: (8727602577393)>

facet_wrap()

# One panel per statistic, with the statistic also mapped to x.
(
    ggplot(td)
    + geom_bar(aes(x='level_1', y='y', fill='g'), stat='identity', position='dodge')
    + coord_flip()
    + facet_wrap('level_1')
)

<ggplot: (8727602445674)>

# One panel per statistic, with the group mapped to x inside each panel.
(
    ggplot(td)
    + geom_bar(aes(x='g', y='y', fill='g'), stat='identity', position='dodge')
    + coord_flip()
    + facet_wrap('level_1')
)

<ggplot: (8727602310554)>

# Grid of panels: statistics in rows, groups in columns.
(
    ggplot(td)
    + facet_grid('level_1~g')
    + geom_bar(aes(x='g', y='y', fill='g'), stat='identity', position='dodge')
    + coord_flip()
)

<ggplot: (8727602187463)>

해들리위컴의 그래프레이어

- 데이터셋 + 맵핑 + 지옴 + 포지션 + 스탯 + 축 + 면분할

데이터셋: 판다스
맵핑: x축, y축, 색깔, 크기, 투명도
지옴: 포인트지옴, 바지옴, 라인지옴, 스무스지옴
포지션: jitter, dodge, identity
스탯: identity, count
축: coord_flip()
면분할: facet_wrap(), facet_grid()

예제: 심슨의 역설

# Admission counts by department (DEP), gender (GEN) and outcome (STATE).
# The three label lists are laid out so that entry i of each lines up with COUNT[i].
DEP = (['A1'] * 2 + ['A2'] * 2 + ['B1'] * 2 + ['B2'] * 2) * 2
GEN = ['M'] * 8 + ['F'] * 8
STATE = ['PASS', 'FAIL'] * 8
COUNT = [1, 9, 2, 8, 80, 20, 85, 15, 5, 5, 5, 5, 9, 1, 9, 1]

df = pd.DataFrame({'DEP': DEP, 'STATE': STATE, 'GEN': GEN, 'COUNT': COUNT})

df

# Applicants per gender and outcome.  'sum' is passed by name because pandas
# deprecated handing callables such as np.sum to groupby aggregation.
df.groupby(['GEN', 'STATE']).agg({'COUNT': 'sum'})

시각화1: 전체합격률

# Step by step: counts per (GEN, STATE), the same as a flat table, and the
# total per GEN.  'sum' is passed by name because pandas deprecated handing
# callables such as np.sum to groupby aggregation.
df.groupby(['GEN', 'STATE']).agg({'COUNT': 'sum'})

df.groupby(['GEN', 'STATE']).agg({'COUNT': 'sum'}).reset_index()

df.groupby(['GEN']).agg({'COUNT': 'sum'}).reset_index()

- 두개의 데이터프레임을 합쳐야 한다.

# _df1: applicants per (GEN, STATE).  _df2: applicants per GEN, with the total
# column renamed to SUM so it does not clash with _df1's COUNT.
# 'sum' is passed by name because pandas deprecated handing callables such as
# np.sum to groupby aggregation.
_df1 = df.groupby(['GEN', 'STATE']).agg({'COUNT': 'sum'}).reset_index()
_df2 = df.groupby(['GEN']).agg({'COUNT': 'sum'}).reset_index().rename(columns={'COUNT': 'SUM'})

display(_df1)
display(_df2)

- 단순한 방법

def f(x):
    """Return the hard-coded applicant total for a gender code.

    'F' maps to 40 and 'M' maps to 220; any other value yields None.
    """
    if x == 'F':
        total = 40
    elif x == 'M':
        total = 220
    else:
        total = None
    return total

# Attach the per-gender total to every row by looking up each GEN value.
_df1['SUM'] = [f(gen) for gen in _df1.GEN]
_df1

- 좀 더 좋은 방법

# Rebuild _df1 from df so it no longer carries the SUM column added above.
# 'sum' is passed by name because pandas deprecated callables such as np.sum here.
_df1 = df.groupby(['GEN', 'STATE']).agg({'COUNT': 'sum'}).reset_index()

_df1을 다시 롤백

def f(_df2):
    """Build a lookup that maps a GEN value to its SUM in ``_df2``.

    The returned callable selects the rows of ``_df2`` whose GEN equals its
    argument and returns their SUM as a scalar via ``.item()``.
    """
    def lookup(x):
        # '@x' in the query string refers to this function's local argument.
        rows = _df2.query('GEN == @x')
        return rows.SUM.item()

    return lookup

_df1.GEN

0    F
1    F
2    M
3    M
Name: GEN, dtype: object

# Build the lookup once from _df2, then apply it to every GEN value.
sum_of = f(_df2)
_df1['SUM'] = [sum_of(gen) for gen in _df1.GEN]
_df1

- 더 좋은 방법

# Rebuild _df1 from df so it no longer carries the SUM column added above.
# 'sum' is passed by name because pandas deprecated callables such as np.sum here.
_df1 = df.groupby(['GEN', 'STATE']).agg({'COUNT': 'sum'}).reset_index()

_df1을 다시 롤백

_df1

_df2

# No key is given, so the frames are joined on the column they share (GEN).
pd.merge(_df1,_df2)

# Method form of the same join.
_df1.merge(_df2)

# Swapping the operands puts _df2's columns (GEN, SUM) first.
_df2.merge(_df1)

td=_df2.merge(_df1)
td

# PROP is each outcome's share of that gender's total applicants.
td['PROP']=td.COUNT/td.SUM

td

# Overall pass rate by gender: keep only the PASS rows and plot PROP as given.
ggplot(td.query('STATE=="PASS"'))+geom_bar(aes(x='GEN',y='PROP',fill='GEN'),stat='identity')

<ggplot: (8727603309472)>

- 남자의 합격률이 더 높다. $\to$ 성차별이 있어보인다(?)

시각화2: 학과별 합격률

- 학과별 합격률

df

# Applicants per (DEP, GEN), attached back onto every row of df as SUM.
# 'sum' is passed by name: pandas deprecated handing callables such as the
# builtin sum to groupby aggregation, and the other cells aggregate the same way.
# The join keys are spelled out instead of being inferred from shared columns.
td = (
    df.groupby(['DEP', 'GEN'])
    .agg({'COUNT': 'sum'})
    .reset_index()
    .rename(columns={'COUNT': 'SUM'})
    .merge(df, on=['DEP', 'GEN'])
)

# Share of each outcome within its department/gender cell.
td['PROP'] = td.COUNT / td.SUM

td

td.query('STATE=="PASS"')

# Pass rate by gender, one panel per department.
(
    ggplot(td.query('STATE=="PASS"'))
    + geom_bar(aes(x='GEN', y='PROP', fill='GEN'), stat='identity')
    + facet_wrap('DEP')
)

<ggplot: (8727602080247)>

	g	y
0	A	-0.080619
1	A	0.219652
2	A	1.349767
3	A	3.382926
4	A	7.170487
...	...	...
295	B	3.280803
296	B	3.225636
297	B	3.564125
298	B	3.553260
299	B	5.228008

	y
	mean	median	std	<lambda_0>
g
A	2.035761	1.988402	2.086418	10.562525
B	2.857144	2.875506	0.964429	5.552557

	DEP	STATE	GEN	COUNT
0	A1	PASS	M	1
1	A1	FAIL	M	9
2	A2	PASS	M	2
3	A2	FAIL	M	8
4	B1	PASS	M	80
5	B1	FAIL	M	20
6	B2	PASS	M	85
7	B2	FAIL	M	15
8	A1	PASS	F	5
9	A1	FAIL	F	5
10	A2	PASS	F	5
11	A2	FAIL	F	5
12	B1	PASS	F	9
13	B1	FAIL	F	1
14	B2	PASS	F	9
15	B2	FAIL	F	1

	GEN	SUM	STATE	COUNT	PROP
0	F	40	FAIL	12	0.300000
1	F	40	PASS	28	0.700000
2	M	220	FAIL	52	0.236364
3	M	220	PASS	168	0.763636

	DEP	GEN	SUM	STATE	COUNT	PROP
0	A1	F	10	PASS	5	0.50
1	A1	F	10	FAIL	5	0.50
2	A1	M	10	PASS	1	0.10
3	A1	M	10	FAIL	9	0.90
4	A2	F	10	PASS	5	0.50
5	A2	F	10	FAIL	5	0.50
6	A2	M	10	PASS	2	0.20
7	A2	M	10	FAIL	8	0.80
8	B1	F	10	PASS	9	0.90
9	B1	F	10	FAIL	1	0.10
10	B1	M	100	PASS	80	0.80
11	B1	M	100	FAIL	20	0.20
12	B2	F	10	PASS	9	0.90
13	B2	F	10	FAIL	1	0.10
14	B2	M	100	PASS	85	0.85
15	B2	M	100	FAIL	15	0.15

	y
g
A	100
B	200

	g	y
0	A	100
1	B	200