In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
1. 위치 추정과 범위 추정¶
- 위치 추정 : 데이터의 대푯값으로 feature의 특징 추정
EDA 방법에서는 대푯값으로 중앙값(Median)을 많이 씀
- 변이 추정 : 데이터의 분포로 feature의 특징 추정
표준편차, 분산이 대표적은 분포 확인 값
In [2]:
import pandas as pd
df = pd.DataFrame({
'A' : [1, 2, 3, 4, 5, 6],
'B' : [1, 2, 3, 4, 5, 100]
})
df
Out[2]:
| A | B | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 2 | 2 |
| 2 | 3 | 3 |
| 3 | 4 | 4 |
| 4 | 5 | 5 |
| 5 | 6 | 100 |
In [3]:
df.describe() # Dataframe의 다섯 수치 요약 확인 가능
Out[3]:
| A | B | |
|---|---|---|
| count | 6.000000 | 6.000000 |
| mean | 3.500000 | 19.166667 |
| std | 1.870829 | 39.625329 |
| min | 1.000000 | 1.000000 |
| 25% | 2.250000 | 2.250000 |
| 50% | 3.500000 | 3.500000 |
| 75% | 4.750000 | 4.750000 |
| max | 6.000000 | 100.000000 |
3. 수치형 데이터의 요약(탐색)¶
- 다섯 수치 요약 (5 number summary) 확인하기
- 최소값(minimum), 제1사분위수, 중간값(mediam)=제2사분위수, 제3사분위수, 최대값(maximum) 확인하기
- 분위수(quartile)
- 자료 크기 순서에 따른 위치값(경계값)
- EDA 에서는 사분위수를 사용
- Q1: 제1사분위수 (25%에 해당하는 값)
- Q2: 제2사분위수 (50%에 해당하는 값, 중간값)
- Q3: 제3사분위수 (75%에 해당하는 값)
- Q4: 제4사분위수 (100%에 해당하는 값, 최대값)
Box Plot으로 Outlier 확인해보기¶
In [4]:
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected = True) # 주피터 노트북에서 바로 Plotting 해보기위한 code
In [5]:
# iplot 으로 확인해보기
df.iplot(kind = 'box')
# 확인 결과 B Data의 분포가 이상함 (Outlier 는 표기되지 않음)
In [6]:
# 2. plotly.graph_objects로 그려보기
In [7]:
import plotly.graph_objects as go
import plotly.offline as pyo # jupyter notebook에서 보여지도록 설정하는 부분
pyo.init_notebook_mode()
In [8]:
import plotly.graph_objects as go
fig = go.Figure() # fig 객체 선언
fig.add_trace( # fig 객체에 y = df['A'] , name = 'A' 인 Plot 할당
go.Box(
y = df['A'], name = 'A'
)
)
fig.add_trace( # fig 객체에 y = df['B'] , name = 'B' 인 Plot 할당
go.Box(
y = df['B'], name = 'B'
)
)
fig.show()
# print(fig)
4. 수치형 데이터 분포 확인¶
- 도수분포표 (frequency table): 수치형 데이터를 구간으로 나눠서 각 구간에 속하는 데이터의 갯수를 나타내는 표
- 히스토그램 (histogram) 그래프: 도수 분포표를 시각적으로 표현한 그래프
In [9]:
import pandas as pd
import numpy as np
# numpy rand 함수 사용해서 0 ~ 1범위의 난수 1만개 생성
df = pd.DataFrame(np.random.rand(10000, 1), columns = ['A'])
df.head()
Out[9]:
| A | |
|---|---|
| 0 | 0.820261 |
| 1 | 0.272102 |
| 2 | 0.978598 |
| 3 | 0.736549 |
| 4 | 0.000674 |
In [10]:
# 1. iplot으로 히스토그램 그려보기
df.iplot(kind = 'histogram', bins = 10) # 히스토그램 구간 10개로
In [11]:
# 2. plotly.graph_objects로 그려보기
import plotly.graph_objects as go
fig = go.Figure() # fig 객체 생성
fig.add_trace( # fig 객체에 그래프 할당?
go.Histogram( # fig 객체에 Histogram 그래프 정의
x = df['A'], name = 'A', # x = df['A'], 이름은 A로
xbins = dict(
start = 0, # x는 0에서 시작 1에서 끝
end = 1.0,
size = 0.05 # 0 ~ 1 구간을 0.05 간격으로 분할
),
marker_color = '#F50057'
)
)
fig.update_layout(
title_text = 'Sampled Results',
xaxis_title_text = 'Value',
yaxis_title_text = 'Count',
bargap = 0.1
)
fig.show()
5. 범주형 데이터 분석¶
In [12]:
data = {
'year' : ['2017', '2017', '2019', '2020', '2021', '2021'],
'grade' : ['C', 'C', 'B', 'A', 'B', 'E'],
}
print(type(data))
df = pd.DataFrame(data)
print(type(df))
<class 'dict'> <class 'pandas.core.frame.DataFrame'>
In [13]:
# groupby 로 feature별로 묶어서 확인해보기
df1 = df.groupby('year').count()
df2 = df.groupby('grade').count()
In [14]:
df1 # year를 기준으로, year에 매칭된 grade 갯수 표현
Out[14]:
| grade | |
|---|---|
| year | |
| 2017 | 2 |
| 2019 | 1 |
| 2020 | 1 |
| 2021 | 2 |
In [15]:
df2 # grade 를 기준으로, grade에 매칭된 year 갯수 표현
Out[15]:
| year | |
|---|---|
| grade | |
| A | 1 |
| B | 2 |
| C | 2 |
| E | 1 |
6. 범주형 데이터 분석에 주로 사용되는 데이터¶
- 막대 그래프 (절대빈도)
- 원 그래프 (상대빈도)
In [16]:
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected = True)
import pandas as pd
In [17]:
data = {
'year' : ['2017', '2017', '2019', '2020', '2021', '2021'],
'grade' : ['C', 'C', 'B', 'A', 'B', 'E'],
}
df = pd.DataFrame(data)
df1 = df.groupby('year').count()
df2 = df.groupby('grade').count()
In [18]:
# 1. iplot으로 막대그래프
df.iplot(kind = 'bar')
In [19]:
# 2. plotly.graph_objects로 그려보기
fig = go.Figure() # Figure 객체 선언
fig.add_trace(
go.Bar(
x = df2.index, y = df2['year'], name = 'A'
)
)
7. 테이블 데이터와 시계열 데이터 다뤄보기¶
- 데이터 테이블 : 행과 열로 이루어진 데이터
- feature : 테이블의 각 열을 의미
- record : 테이블의 각 행을 의미
- index : 각 데이터 위치를 식별하기 위한 값
In [20]:
# 시계열 데이터 생성
pd.date_range(start = '2021-01-01', end = '2021-12-31')
Out[20]:
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
'2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
'2021-01-09', '2021-01-10',
...
'2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
'2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
'2021-12-30', '2021-12-31'],
dtype='datetime64[ns]', length=365, freq='D')
In [21]:
# 3일 간격으로 시계열 데이터 만들기
pd.date_range(start = '2021-01-01', end = '2021-12-31', periods = 3)
# start ~ end 사이의 시간을 3개로 나눔
Out[21]:
DatetimeIndex(['2021-01-01', '2021-07-02', '2021-12-31'], dtype='datetime64[ns]', freq=None)
pandas Dataframe 작성 방법 정리¶
- pd.DataFrame(data={컬럼이름:컬럼데이터리스트})
- {컬럼이름:컬럼데이터리스트} 는 사전 데이터 형식
- pd.DataFrame(data=리스트, columns=컬럼이름, index=인덱스데이터)
In [22]:
import numpy as np
import pandas as pd
date_index = pd.date_range('2021-01-01', periods = 15) # 15개의 데이터를 하루 간격으로
df = pd.DataFrame(data = range(len(date_index)), columns = ['count'], index = date_index)
df
Out[22]:
| count | |
|---|---|
| 2021-01-01 | 0 |
| 2021-01-02 | 1 |
| 2021-01-03 | 2 |
| 2021-01-04 | 3 |
| 2021-01-05 | 4 |
| 2021-01-06 | 5 |
| 2021-01-07 | 6 |
| 2021-01-08 | 7 |
| 2021-01-09 | 8 |
| 2021-01-10 | 9 |
| 2021-01-11 | 10 |
| 2021-01-12 | 11 |
| 2021-01-13 | 12 |
| 2021-01-14 | 13 |
| 2021-01-15 | 14 |
8. 시계열 데이터 다뤄보기¶
- 라인 그래프
- 막대 그래프
In [23]:
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected = True)
# iplot으로 그려보기
df.iplot(kind = 'line')
In [24]:
# plotly.graph_objects로 그려보기
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(
go.Scatter(
x = df.index, y = df['count'], mode = 'lines + markers'
)
)
9. 데이터간 상관관계 확인해보기 (Heatmap)¶
In [25]:
import chart_studio.plotly as py
import cufflinks as cf
import pandas as pd
cf.go_offline(connected = True)
doc = pd.read_csv('../COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/04-01-2020.csv', encoding='utf-8-sig')
# df.head()
doc2 = df.corr()
doc2
Out[25]:
| count | |
|---|---|
| count | 1.0 |
In [26]:
# 1. iplot으로 heatmap 그려보기
doc2.iplot(kind = 'heatmap', colorscale = 'ylorrd')
In [27]:
cf.help('heatmap')
HEATMAP
Heatmap
Parameters:
===========
center_scale : float
Centers the colorscale at a specific value
Automatically sets the (zmin,zmax) values
zmax : float
Defines the maximum range for the z values
This affects the range for the colorscale
zmin : float
Defines the minimum range for the z values
This affects the range for the colorscale
colors : dict, list or string
Trace color
string : applies to all traces
list : applies to each trace in the order specified
dict : {column:value} for each column in the dataframe
values
colorname : see cufflinks.colors.cnames
hex : '#ffffff'
rgb : 'rgb(23,50,23)'
rgba : 'rgba(23,50,23,.5)
colorscale : string
Color scale name
If the color is preceded by a minus (-)
then the scale is inversed.
Only valid if 'colors' is null.
see cufflinks.colors.scales() for all available scales
data : figure
Plotly Figure
rangeslider : bool or dict
Defines if a range slider is displayed.
If True : displays a range slider
If dict : defines a range slider object
Example:
{'bgcolor':('blue',.3),'autorange':True}
text : string
Name of the column that contains the text values
width : int, list or dict
Line width
int : applies to all traces
list : applies to each trace in the order specified
dict : {column:value} for each column in the dataframe
ANNOTATIONS
annotations : dict
Dictionary of annotations
{x_point : text}
fontcolor : string
Text color
fontsize : int
Text size
textangle : int
Text angle
EXPORTS
asFigure : bool
If True then it returns a Plotly Figure
asImage : bool
If True then it returns an image (PNG)
While in ONLINE mode:
Image file is saved in the working directory
Accepts:
filename
dimensions
scale
display_image
While in OFFLINE mode:
Image file is downloaded (downloads folder) and a
regular plotly chart is displayed in Jupyter
Accepts:
filename
dimensions
asPlot : bool
If True then the chart opens in a browser
asURL : bool
If True the chart url/path is returned.
No chart is displayed.
If ONLINE : The URL is returned
If OFFLINE : the local path is returned
display_image : bool
If True, then the image is displayed
after being saved.
Only valid if 'asImage=True'
filename : string
Filename to be saved as
online : bool
If True then the chart/image is rendered on the server
even when running in Offline mode
scale : int
Increase the resolution of the image by `scale` amount
Only valid if 'asImage=True'
sharing : string
Sets the sharing level permission
public - anyone can see the chart
private - only you can see this chart
secret - only people with the link can see the chart
LAYOUT
layout : Plotly Layout
If defined, this Layout is explicitly used for the Figure generation
dimensions : tuple
Dimensions for image/chart
(width,height)
fontfamily : string
HTML Font typeface that will be applied
It needs to exist on the system on which it operates.
Examples:
'Times New Roman'
'Open Sans'
'Monospace'
gridcolor : string
Sets the grid color
colorname : see cufflinks.colors.cnames
hex : '#ffffff'
rgb : 'rgb(23,50,23)'
rgba : 'rgba(23,50,23,.5)
legend : string
Defines where the legend should appear
Values:
bottom
top
margin : dict or tuple
Sets the margin dimensions
{'l':left,'r':right,'b':bottom,'t':top}
(left,right,bottom,top)
showlegend : bool
Defines if the legend should appear
theme : string
Layout theme
solar
pearl
white
see cufflinks.getThemes() for all available themes
title : string
Chart title
xTitle : string
X Axis Title
yTitle : string
Y Axis Title
zerolinecolor : string
Sets the zero line color
colorname : see cufflinks.colors.cnames
hex : '#ffffff'
rgb : 'rgb(23,50,23)'
rgba : 'rgba(23,50,23,.5)
layout_update : dict
The Layout will be explicitly modified
with the values stated in the dictionary.
Not valid when Layout is passed as a parameter
SHAPES
hline : float, list or dict
Draws a horizontal line at the indicated 'y'
position(s).
Extra parameters can be passed in the form of a
dictionary (see 'shapes')
hline=4
hline=[2,10]
hline=[{'y':2,'color':'blue'},{'y':3,'color':'red'}]
hspan : tuple, list or dict
Draws a horizontal rectangle at the indicated
(y0,y1) positions.
Extra parameters can be passed in the form of a
dictionary (see 'shapes')
hspan=(1,5)
hspan=[(1,4),(6,10)]
hspan=[{'y0':2,'y1':5,'color':'blue','fill':True,'opacity':.4}]
vline : float, list or dict
Draws a vertical line at the indicated 'x'
position(s).
Extra parameters can be passed in the form of a
dictionary (see 'shapes')
vline=4
vline=[2,10]
vline=[{'x':'2015-02-08','color':'blue'},{'x':'2015-03-08','color':'red'}]
vspan : tuple, list or dict
Draws a vertical rectangle at the indicated
(x0,x1) positions.
Extra parameters can be passed in the form of a
dictionary (see 'shapes')
vspan=('2015-02-08','2015-03-08')
vspan=[(1,4),(6,10)]
vspan=[{'x0':2,'x1':5,'color':'blue','fill':True,'opacity':.4}]
shapes : list or dict
List of dictionaries with the specification
of a given shape.
For more information
see help(cufflinks.tools.get_shape)
SUBPLOTS
horizontal_spacing : float [0-1]
Space between subplot columns
shape : (int,int)
Indicates the size of rows and columns.
If ommitted, then the shape is automatically set
* Only valid if subplots=True
(rows,columns)
shared_xaxes : bool
If True, subplots in the same grid column have one common
shared x-axis at the bottom of the grid.
shared_yaxes : bool
If True, subplots in the same grid row have one common
shared y-axis at the left of the grid.
subplot_titles : bool
If True, chart titles are displayed
at the top of each subplot.
subplots : bool
If True then each trace is placed in a subplot
vertical_spacing : float [0-1]
Space between subplot rows
AXIS
logx : bool
Sets the x axis to be of logarithmic scale
logy : bool
Sets the y axis to be of logarithmic scale
logz : bool
Sets the z axis to be of logarithmic scale
xrange : tuple
Sets the range for the x axis
(lower_bound,upper_bound)
yrange : tuple
Sets the range for the y axis
(lower_bound,upper_bound)
EXAMPLES
>> cf.datagen.heatmap().iplot(kind='heatmap')
>> cf.datagen.heatmap().iplot(kind='heatmap',colorscale='rdbu',center_scale=50)
In [28]:
cf.colors.scales()
accent
blues
brbg
bugn
bupu
dark2
dflt
ggplot
gnbu
greens
greys
henanigans
oranges
original
orrd
paired
pastel1
pastel2
piyg
plotly
polar
prgn
pubu
pubugn
puor
purd
purples
rdbu
rdgy
rdpu
rdylbu
rdylgn
reds
set1
set2
set3
spectral
ylgn
ylgnbu
ylorbr
ylorrd
In [29]:
# 2. plotly.graph_objects로 그려보기
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(
go.Heatmap(
x = doc2.index,
y = doc2.columns,
z = doc2,
colorscale = 'Reds'
)
)
10. 데이터 상관관계 알아보기(Scatter Plot)¶
In [30]:
# iplot으로 그려보기
doc.iplot(kind = 'scatter', x = 'Recovered', y = 'Confirmed', mode = 'markers')
In [31]:
# 2. plotly.graph_objects로 산점도 그려보기
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(
go.Scatter(
x = doc['Recovered'], y = doc['Confirmed'], mode = 'markers'
)
)
'데이터분석 > Pandas' 카테고리의 다른 글
| [Pandas] plotly 사용해서 시각화 해보기 2 (0) | 2021.07.18 |
|---|---|
| [Pandas] plotly 사용해서 시각화 해보기 1 (0) | 2021.07.18 |
| [Pandas] 데이터 처리 연습2 결과물 시각화 (0) | 2021.07.17 |
| [Pandas] 데이터 처리 연습 2 (0) | 2021.07.17 |
| [Pandas] 데이터 처리 연습 (0) | 2021.07.15 |