In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
1. 위치 추정과 범위 추정¶
- 위치 추정 : 데이터의 대푯값으로 feature의 특징 추정
EDA 방법에서는 대푯값으로 중앙값(Median)을 많이 씀
- 변이 추정 : 데이터의 분포로 feature의 특징 추정
표준편차, 분산이 대표적은 분포 확인 값
In [2]:
import pandas as pd
df = pd.DataFrame({
'A' : [1, 2, 3, 4, 5, 6],
'B' : [1, 2, 3, 4, 5, 100]
})
df
Out[2]:
A | B | |
---|---|---|
0 | 1 | 1 |
1 | 2 | 2 |
2 | 3 | 3 |
3 | 4 | 4 |
4 | 5 | 5 |
5 | 6 | 100 |
In [3]:
df.describe() # Dataframe의 다섯 수치 요약 확인 가능
Out[3]:
A | B | |
---|---|---|
count | 6.000000 | 6.000000 |
mean | 3.500000 | 19.166667 |
std | 1.870829 | 39.625329 |
min | 1.000000 | 1.000000 |
25% | 2.250000 | 2.250000 |
50% | 3.500000 | 3.500000 |
75% | 4.750000 | 4.750000 |
max | 6.000000 | 100.000000 |
3. 수치형 데이터의 요약(탐색)¶
- 다섯 수치 요약 (5 number summary) 확인하기
- 최소값(minimum), 제1사분위수, 중간값(mediam)=제2사분위수, 제3사분위수, 최대값(maximum) 확인하기
- 분위수(quartile)
- 자료 크기 순서에 따른 위치값(경계값)
- EDA 에서는 사분위수를 사용
- Q1: 제1사분위수 (25%에 해당하는 값)
- Q2: 제2사분위수 (50%에 해당하는 값, 중간값)
- Q3: 제3사분위수 (75%에 해당하는 값)
- Q4: 제4사분위수 (100%에 해당하는 값, 최대값)
Box Plot으로 Outlier 확인해보기¶
In [4]:
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected = True) # 주피터 노트북에서 바로 Plotting 해보기위한 code
In [5]:
# iplot 으로 확인해보기
df.iplot(kind = 'box')
# 확인 결과 B Data의 분포가 이상함 (Outlier 는 표기되지 않음)
In [6]:
# 2. plotly.graph_objects로 그려보기
In [7]:
import plotly.graph_objects as go
import plotly.offline as pyo # jupyter notebook에서 보여지도록 설정하는 부분
pyo.init_notebook_mode()
In [8]:
import plotly.graph_objects as go
fig = go.Figure() # fig 객체 선언
fig.add_trace( # fig 객체에 y = df['A'] , name = 'A' 인 Plot 할당
go.Box(
y = df['A'], name = 'A'
)
)
fig.add_trace( # fig 객체에 y = df['B'] , name = 'B' 인 Plot 할당
go.Box(
y = df['B'], name = 'B'
)
)
fig.show()
# print(fig)
4. 수치형 데이터 분포 확인¶
- 도수분포표 (frequency table): 수치형 데이터를 구간으로 나눠서 각 구간에 속하는 데이터의 갯수를 나타내는 표
- 히스토그램 (histogram) 그래프: 도수 분포표를 시각적으로 표현한 그래프
In [9]:
import pandas as pd
import numpy as np
# numpy rand 함수 사용해서 0 ~ 1범위의 난수 1만개 생성
df = pd.DataFrame(np.random.rand(10000, 1), columns = ['A'])
df.head()
Out[9]:
A | |
---|---|
0 | 0.820261 |
1 | 0.272102 |
2 | 0.978598 |
3 | 0.736549 |
4 | 0.000674 |
In [10]:
# 1. iplot으로 히스토그램 그려보기
df.iplot(kind = 'histogram', bins = 10) # 히스토그램 구간 10개로
In [11]:
# 2. plotly.graph_objects로 그려보기
import plotly.graph_objects as go
fig = go.Figure() # fig 객체 생성
fig.add_trace( # fig 객체에 그래프 할당?
go.Histogram( # fig 객체에 Histogram 그래프 정의
x = df['A'], name = 'A', # x = df['A'], 이름은 A로
xbins = dict(
start = 0, # x는 0에서 시작 1에서 끝
end = 1.0,
size = 0.05 # 0 ~ 1 구간을 0.05 간격으로 분할
),
marker_color = '#F50057'
)
)
fig.update_layout(
title_text = 'Sampled Results',
xaxis_title_text = 'Value',
yaxis_title_text = 'Count',
bargap = 0.1
)
fig.show()
5. 범주형 데이터 분석¶
In [12]:
data = {
'year' : ['2017', '2017', '2019', '2020', '2021', '2021'],
'grade' : ['C', 'C', 'B', 'A', 'B', 'E'],
}
print(type(data))
df = pd.DataFrame(data)
print(type(df))
<class 'dict'> <class 'pandas.core.frame.DataFrame'>
In [13]:
# groupby 로 feature별로 묶어서 확인해보기
df1 = df.groupby('year').count()
df2 = df.groupby('grade').count()
In [14]:
df1 # year를 기준으로, year에 매칭된 grade 갯수 표현
Out[14]:
grade | |
---|---|
year | |
2017 | 2 |
2019 | 1 |
2020 | 1 |
2021 | 2 |
In [15]:
df2 # grade 를 기준으로, grade에 매칭된 year 갯수 표현
Out[15]:
year | |
---|---|
grade | |
A | 1 |
B | 2 |
C | 2 |
E | 1 |
6. 범주형 데이터 분석에 주로 사용되는 데이터¶
- 막대 그래프 (절대빈도)
- 원 그래프 (상대빈도)
In [16]:
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected = True)
import pandas as pd
In [17]:
data = {
'year' : ['2017', '2017', '2019', '2020', '2021', '2021'],
'grade' : ['C', 'C', 'B', 'A', 'B', 'E'],
}
df = pd.DataFrame(data)
df1 = df.groupby('year').count()
df2 = df.groupby('grade').count()
In [18]:
# 1. iplot으로 막대그래프
df.iplot(kind = 'bar')
In [19]:
# 2. plotly.graph_objects로 그려보기
fig = go.Figure() # Figure 객체 선언
fig.add_trace(
go.Bar(
x = df2.index, y = df2['year'], name = 'A'
)
)
7. 테이블 데이터와 시계열 데이터 다뤄보기¶
- 데이터 테이블 : 행과 열로 이루어진 데이터
- feature : 테이블의 각 열을 의미
- record : 테이블의 각 행을 의미
- index : 각 데이터 위치를 식별하기 위한 값
In [20]:
# 시계열 데이터 생성
pd.date_range(start = '2021-01-01', end = '2021-12-31')
Out[20]:
DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09', '2021-01-10', ... '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25', '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29', '2021-12-30', '2021-12-31'], dtype='datetime64[ns]', length=365, freq='D')
In [21]:
# 3일 간격으로 시계열 데이터 만들기
pd.date_range(start = '2021-01-01', end = '2021-12-31', periods = 3)
# start ~ end 사이의 시간을 3개로 나눔
Out[21]:
DatetimeIndex(['2021-01-01', '2021-07-02', '2021-12-31'], dtype='datetime64[ns]', freq=None)
pandas Dataframe 작성 방법 정리¶
- pd.DataFrame(data={컬럼이름:컬럼데이터리스트})
- {컬럼이름:컬럼데이터리스트} 는 사전 데이터 형식
- pd.DataFrame(data=리스트, columns=컬럼이름, index=인덱스데이터)
In [22]:
import numpy as np
import pandas as pd
date_index = pd.date_range('2021-01-01', periods = 15) # 15개의 데이터를 하루 간격으로
df = pd.DataFrame(data = range(len(date_index)), columns = ['count'], index = date_index)
df
Out[22]:
count | |
---|---|
2021-01-01 | 0 |
2021-01-02 | 1 |
2021-01-03 | 2 |
2021-01-04 | 3 |
2021-01-05 | 4 |
2021-01-06 | 5 |
2021-01-07 | 6 |
2021-01-08 | 7 |
2021-01-09 | 8 |
2021-01-10 | 9 |
2021-01-11 | 10 |
2021-01-12 | 11 |
2021-01-13 | 12 |
2021-01-14 | 13 |
2021-01-15 | 14 |
8. 시계열 데이터 다뤄보기¶
- 라인 그래프
- 막대 그래프
In [23]:
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected = True)
# iplot으로 그려보기
df.iplot(kind = 'line')
In [24]:
# plotly.graph_objects로 그려보기
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(
go.Scatter(
x = df.index, y = df['count'], mode = 'lines + markers'
)
)
9. 데이터간 상관관계 확인해보기 (Heatmap)¶
In [25]:
import chart_studio.plotly as py
import cufflinks as cf
import pandas as pd
cf.go_offline(connected = True)
doc = pd.read_csv('../COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/04-01-2020.csv', encoding='utf-8-sig')
# df.head()
doc2 = df.corr()
doc2
Out[25]:
count | |
---|---|
count | 1.0 |
In [26]:
# 1. iplot으로 heatmap 그려보기
doc2.iplot(kind = 'heatmap', colorscale = 'ylorrd')
In [27]:
cf.help('heatmap')
HEATMAP Heatmap Parameters: =========== center_scale : float Centers the colorscale at a specific value Automatically sets the (zmin,zmax) values zmax : float Defines the maximum range for the z values This affects the range for the colorscale zmin : float Defines the minimum range for the z values This affects the range for the colorscale colors : dict, list or string Trace color string : applies to all traces list : applies to each trace in the order specified dict : {column:value} for each column in the dataframe values colorname : see cufflinks.colors.cnames hex : '#ffffff' rgb : 'rgb(23,50,23)' rgba : 'rgba(23,50,23,.5) colorscale : string Color scale name If the color is preceded by a minus (-) then the scale is inversed. Only valid if 'colors' is null. see cufflinks.colors.scales() for all available scales data : figure Plotly Figure rangeslider : bool or dict Defines if a range slider is displayed. If True : displays a range slider If dict : defines a range slider object Example: {'bgcolor':('blue',.3),'autorange':True} text : string Name of the column that contains the text values width : int, list or dict Line width int : applies to all traces list : applies to each trace in the order specified dict : {column:value} for each column in the dataframe ANNOTATIONS annotations : dict Dictionary of annotations {x_point : text} fontcolor : string Text color fontsize : int Text size textangle : int Text angle EXPORTS asFigure : bool If True then it returns a Plotly Figure asImage : bool If True then it returns an image (PNG) While in ONLINE mode: Image file is saved in the working directory Accepts: filename dimensions scale display_image While in OFFLINE mode: Image file is downloaded (downloads folder) and a regular plotly chart is displayed in Jupyter Accepts: filename dimensions asPlot : bool If True then the chart opens in a browser asURL : bool If True the chart url/path is returned. No chart is displayed. If ONLINE : The URL is returned If OFFLINE : the local path is returned display_image : bool If True, then the image is displayed after being saved. Only valid if 'asImage=True' filename : string Filename to be saved as online : bool If True then the chart/image is rendered on the server even when running in Offline mode scale : int Increase the resolution of the image by `scale` amount Only valid if 'asImage=True' sharing : string Sets the sharing level permission public - anyone can see the chart private - only you can see this chart secret - only people with the link can see the chart LAYOUT layout : Plotly Layout If defined, this Layout is explicitly used for the Figure generation dimensions : tuple Dimensions for image/chart (width,height) fontfamily : string HTML Font typeface that will be applied It needs to exist on the system on which it operates. Examples: 'Times New Roman' 'Open Sans' 'Monospace' gridcolor : string Sets the grid color colorname : see cufflinks.colors.cnames hex : '#ffffff' rgb : 'rgb(23,50,23)' rgba : 'rgba(23,50,23,.5) legend : string Defines where the legend should appear Values: bottom top margin : dict or tuple Sets the margin dimensions {'l':left,'r':right,'b':bottom,'t':top} (left,right,bottom,top) showlegend : bool Defines if the legend should appear theme : string Layout theme solar pearl white see cufflinks.getThemes() for all available themes title : string Chart title xTitle : string X Axis Title yTitle : string Y Axis Title zerolinecolor : string Sets the zero line color colorname : see cufflinks.colors.cnames hex : '#ffffff' rgb : 'rgb(23,50,23)' rgba : 'rgba(23,50,23,.5) layout_update : dict The Layout will be explicitly modified with the values stated in the dictionary. Not valid when Layout is passed as a parameter SHAPES hline : float, list or dict Draws a horizontal line at the indicated 'y' position(s). Extra parameters can be passed in the form of a dictionary (see 'shapes') hline=4 hline=[2,10] hline=[{'y':2,'color':'blue'},{'y':3,'color':'red'}] hspan : tuple, list or dict Draws a horizontal rectangle at the indicated (y0,y1) positions. Extra parameters can be passed in the form of a dictionary (see 'shapes') hspan=(1,5) hspan=[(1,4),(6,10)] hspan=[{'y0':2,'y1':5,'color':'blue','fill':True,'opacity':.4}] vline : float, list or dict Draws a vertical line at the indicated 'x' position(s). Extra parameters can be passed in the form of a dictionary (see 'shapes') vline=4 vline=[2,10] vline=[{'x':'2015-02-08','color':'blue'},{'x':'2015-03-08','color':'red'}] vspan : tuple, list or dict Draws a vertical rectangle at the indicated (x0,x1) positions. Extra parameters can be passed in the form of a dictionary (see 'shapes') vspan=('2015-02-08','2015-03-08') vspan=[(1,4),(6,10)] vspan=[{'x0':2,'x1':5,'color':'blue','fill':True,'opacity':.4}] shapes : list or dict List of dictionaries with the specification of a given shape. For more information see help(cufflinks.tools.get_shape) SUBPLOTS horizontal_spacing : float [0-1] Space between subplot columns shape : (int,int) Indicates the size of rows and columns. If ommitted, then the shape is automatically set * Only valid if subplots=True (rows,columns) shared_xaxes : bool If True, subplots in the same grid column have one common shared x-axis at the bottom of the grid. shared_yaxes : bool If True, subplots in the same grid row have one common shared y-axis at the left of the grid. subplot_titles : bool If True, chart titles are displayed at the top of each subplot. subplots : bool If True then each trace is placed in a subplot vertical_spacing : float [0-1] Space between subplot rows AXIS logx : bool Sets the x axis to be of logarithmic scale logy : bool Sets the y axis to be of logarithmic scale logz : bool Sets the z axis to be of logarithmic scale xrange : tuple Sets the range for the x axis (lower_bound,upper_bound) yrange : tuple Sets the range for the y axis (lower_bound,upper_bound) EXAMPLES >> cf.datagen.heatmap().iplot(kind='heatmap') >> cf.datagen.heatmap().iplot(kind='heatmap',colorscale='rdbu',center_scale=50)
In [28]:
cf.colors.scales()
accent
blues
brbg
bugn
bupu
dark2
dflt
ggplot
gnbu
greens
greys
henanigans
oranges
original
orrd
paired
pastel1
pastel2
piyg
plotly
polar
prgn
pubu
pubugn
puor
purd
purples
rdbu
rdgy
rdpu
rdylbu
rdylgn
reds
set1
set2
set3
spectral
ylgn
ylgnbu
ylorbr
ylorrd
In [29]:
# 2. plotly.graph_objects로 그려보기
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(
go.Heatmap(
x = doc2.index,
y = doc2.columns,
z = doc2,
colorscale = 'Reds'
)
)
10. 데이터 상관관계 알아보기(Scatter Plot)¶
In [30]:
# iplot으로 그려보기
doc.iplot(kind = 'scatter', x = 'Recovered', y = 'Confirmed', mode = 'markers')
In [31]:
# 2. plotly.graph_objects로 산점도 그려보기
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(
go.Scatter(
x = doc['Recovered'], y = doc['Confirmed'], mode = 'markers'
)
)
'데이터분석 > Pandas' 카테고리의 다른 글
[Pandas] plotly 사용해서 시각화 해보기 2 (0) | 2021.07.18 |
---|---|
[Pandas] plotly 사용해서 시각화 해보기 1 (0) | 2021.07.18 |
[Pandas] 데이터 처리 연습2 결과물 시각화 (0) | 2021.07.17 |
[Pandas] 데이터 처리 연습 2 (0) | 2021.07.17 |
[Pandas] 데이터 처리 연습 (0) | 2021.07.15 |