Object creation (2024-01-26)¶

In [ ]:

import numpy as np
import pandas as pd

In [ ]:

s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

Out[ ]:

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [ ]:

dates = pd.date_range("20130101", periods=6)
dates

Out[ ]:

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [ ]:

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df

Out[ ]:

	A	B	C	D
2013-01-01	0.278327	0.645447	0.621123	-0.067120
2013-01-02	0.411894	1.101236	0.254361	-0.250035
2013-01-03	-0.008112	1.272442	0.294107	-0.389570
2013-01-04	0.751519	-0.346030	0.384487	-0.803443
2013-01-05	-0.008458	1.078977	0.502768	0.985506
2013-01-06	-0.486277	-1.506802	-0.864283	1.469331

In [ ]:

# Build a frame from a dict of columns; the scalar entries ("A", "B", "F")
# are repeated for every row supplied by the array-like entries.
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3, 3, 3, 3], dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo"  # a trailing comma after the last entry is optional
})
df2

Out[ ]:

	A	B	C	D	E	F
0	1.0	2013-01-02	1.0	3	test	foo
1	1.0	2013-01-02	1.0	3	train	foo
2	1.0	2013-01-02	1.0	3	test	foo
3	1.0	2013-01-02	1.0	3	train	foo

In [ ]:

df2.dtypes

Out[ ]:

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [ ]:

#df2.<TAB> # noqa: E225, E999

#df2.까지 입력후 탭누르라는 의미, vscode에서는 탭안눌러도 자동으로 표시 단 컬럼은 안 표시됨

Viewing data (2024-01-27)¶

In [ ]:

df.head()

Out[ ]:

	A	B	C	D
2013-01-01	0.278327	0.645447	0.621123	-0.067120
2013-01-02	0.411894	1.101236	0.254361	-0.250035
2013-01-03	-0.008112	1.272442	0.294107	-0.389570
2013-01-04	0.751519	-0.346030	0.384487	-0.803443
2013-01-05	-0.008458	1.078977	0.502768	0.985506

In [ ]:

df.tail(3)

Out[ ]:

	A	B	C	D
2013-01-04	0.751519	-0.346030	0.384487	-0.803443
2013-01-05	-0.008458	1.078977	0.502768	0.985506
2013-01-06	-0.486277	-1.506802	-0.864283	1.469331

In [ ]:

df.index

Out[ ]:

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [ ]:

df.columns

Out[ ]:

Index(['A', 'B', 'C', 'D'], dtype='object')

In [ ]:

df.to_numpy()

Out[ ]:

array([[ 0.27832749,  0.64544663,  0.62112283, -0.06711988],
       [ 0.41189372,  1.10123621,  0.254361  , -0.25003483],
       [-0.0081116 ,  1.27244212,  0.29410743, -0.38957036],
       [ 0.75151886, -0.34602993,  0.38448688, -0.80344264],
       [-0.00845814,  1.07897669,  0.50276799,  0.98550603],
       [-0.48627682, -1.50680214, -0.86428265,  1.46933149]])

In [ ]:

df.values

Out[ ]:

array([[ 0.27832749,  0.64544663,  0.62112283, -0.06711988],
       [ 0.41189372,  1.10123621,  0.254361  , -0.25003483],
       [-0.0081116 ,  1.27244212,  0.29410743, -0.38957036],
       [ 0.75151886, -0.34602993,  0.38448688, -0.80344264],
       [-0.00845814,  1.07897669,  0.50276799,  0.98550603],
       [-0.48627682, -1.50680214, -0.86428265,  1.46933149]])

In [ ]:

df2.dtypes

Out[ ]:

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [ ]:

df2.to_numpy()

Out[ ]:

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [ ]:

df2.values

Out[ ]:

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [ ]:

df.describe()

Out[ ]:

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	0.156482	0.374212	0.198761	0.157445
std	0.424638	1.092026	0.538061	0.877010
min	-0.486277	-1.506802	-0.864283	-0.803443
25%	-0.008372	-0.098161	0.264298	-0.354686
50%	0.135108	0.862212	0.339297	-0.158577
75%	0.378502	1.095671	0.473198	0.722350
max	0.751519	1.272442	0.621123	1.469331

In [ ]:

df.T
# Swaps rows and columns (transpose).
# T is an attribute, not a method, so it is accessed without parentheses.
# NOTE(review): the original note said the result is "pre-computed"; pandas
# documents T as a property wrapping transpose(), i.e. evaluated on access -- confirm.

Out[ ]:

	2013-01-01	2013-01-02	2013-01-03	2013-01-04	2013-01-05	2013-01-06
A	0.278327	0.411894	-0.008112	0.751519	-0.008458	-0.486277
B	0.645447	1.101236	1.272442	-0.346030	1.078977	-1.506802
C	0.621123	0.254361	0.294107	0.384487	0.502768	-0.864283
D	-0.067120	-0.250035	-0.389570	-0.803443	0.985506	1.469331

In [ ]:

df.T.index
#값만 바꿔서 나타내는가해서 확인용으로 해줬는데 인덱스도 확실히 바뀌어있다

Out[ ]:

Index(['A', 'B', 'C', 'D'], dtype='object')

In [ ]:

df.sort_index(axis=1, ascending=False)
#axis = 1 하면 열끼리 정렬하는 것

Out[ ]:

	D	C	B	A
2013-01-01	-0.067120	0.621123	0.645447	0.278327
2013-01-02	-0.250035	0.254361	1.101236	0.411894
2013-01-03	-0.389570	0.294107	1.272442	-0.008112
2013-01-04	-0.803443	0.384487	-0.346030	0.751519
2013-01-05	0.985506	0.502768	1.078977	-0.008458
2013-01-06	1.469331	-0.864283	-1.506802	-0.486277

In [ ]:

df.sort_values(by="B")
#df안에 있는 특정 열의 값에 대해서 정렬해줄 수 있음

Out[ ]:

	A	B	C	D
2013-01-06	-0.486277	-1.506802	-0.864283	1.469331
2013-01-04	0.751519	-0.346030	0.384487	-0.803443
2013-01-01	0.278327	0.645447	0.621123	-0.067120
2013-01-05	-0.008458	1.078977	0.502768	0.985506
2013-01-02	0.411894	1.101236	0.254361	-0.250035
2013-01-03	-0.008112	1.272442	0.294107	-0.389570

Selection (2024-01-28)¶

Getitem ([])¶

In [ ]:

df["A"]

Out[ ]:

2013-01-01    0.278327
2013-01-02    0.411894
2013-01-03   -0.008112
2013-01-04    0.751519
2013-01-05   -0.008458
2013-01-06   -0.486277
Freq: D, Name: A, dtype: float64

In [ ]:

df[0:3]

Out[ ]:

	A	B	C	D
2013-01-01	0.278327	0.645447	0.621123	-0.067120
2013-01-02	0.411894	1.101236	0.254361	-0.250035
2013-01-03	-0.008112	1.272442	0.294107	-0.389570

In [ ]:

df["20130102":"20130104"]

Out[ ]:

	A	B	C	D
2013-01-02	0.411894	1.101236	0.254361	-0.250035
2013-01-03	-0.008112	1.272442	0.294107	-0.389570
2013-01-04	0.751519	-0.346030	0.384487	-0.803443

Selection by label¶

In [ ]:

df.loc[dates[0]]

Out[ ]:

A    0.278327
B    0.645447
C    0.621123
D   -0.067120
Name: 2013-01-01 00:00:00, dtype: float64

In [ ]:

df.loc[dates[0],:]

Out[ ]:

A    0.278327
B    0.645447
C    0.621123
D   -0.067120
Name: 2013-01-01 00:00:00, dtype: float64

In [ ]:

df.loc[:,["A","B"]]

Out[ ]:

	A	B
2013-01-01	0.278327	0.645447
2013-01-02	0.411894	1.101236
2013-01-03	-0.008112	1.272442
2013-01-04	0.751519	-0.346030
2013-01-05	-0.008458	1.078977
2013-01-06	-0.486277	-1.506802

In [ ]:

df.loc["20130102":"20130104",["A","B"]]

Out[ ]:

	A	B
2013-01-02	0.411894	1.101236
2013-01-03	-0.008112	1.272442
2013-01-04	0.751519	-0.346030

In [ ]:

df.loc["20130102":"20130104",["B","A"]]

Out[ ]:

	B	A
2013-01-02	1.101236	0.411894
2013-01-03	1.272442	-0.008112
2013-01-04	-0.346030	0.751519

In [ ]:

df.loc[dates[0],"A"]

Out[ ]:

0.2783274883042243

In [ ]:

df.at[dates[0],"A"]

Out[ ]:

0.2783274883042243

Selection by position¶

In [ ]:

df.iloc[3]

Out[ ]:

A    0.751519
B   -0.346030
C    0.384487
D   -0.803443
Name: 2013-01-04 00:00:00, dtype: float64

In [ ]:

df.iloc[3:5, 0:2]

Out[ ]:

	A	B
2013-01-04	0.751519	-0.346030
2013-01-05	-0.008458	1.078977

In [ ]:

# Comparison: .loc slices by label and includes the end label, whereas .iloc
# slices by position and excludes the end position.
# df.loc[3:5, 0:2]  # does not work: .loc cannot take integer positions on this frame
df.loc["20130104":"20130105", "A":"B"]

Out[ ]:

	A	B
2013-01-04	0.751519	-0.346030
2013-01-05	-0.008458	1.078977

In [ ]:

df.iloc[[1,2,4],[0,2]]

Out[ ]:

	A	C
2013-01-02	0.411894	0.254361
2013-01-03	-0.008112	0.294107
2013-01-05	-0.008458	0.502768

In [ ]:

df.iloc[1:3, :]

Out[ ]:

	A	B	C	D
2013-01-02	0.411894	1.101236	0.254361	-0.250035
2013-01-03	-0.008112	1.272442	0.294107	-0.389570

In [ ]:

df.iloc[:, 1:3]

Out[ ]:

	B	C
2013-01-01	0.645447	0.621123
2013-01-02	1.101236	0.254361
2013-01-03	1.272442	0.294107
2013-01-04	-0.346030	0.384487
2013-01-05	1.078977	0.502768
2013-01-06	-1.506802	-0.864283

In [ ]:

df.iloc[1,1]

Out[ ]:

1.1012362078351068

In [ ]:

df.iat[1,1]

Out[ ]:

1.1012362078351068

Boolean indexing (조건에 따른 컬럼 선택)¶

In [ ]:

df[df["A"] > 0]

Out[ ]:

	A	B	C	D
2013-01-01	0.278327	0.645447	0.621123	-0.067120
2013-01-02	0.411894	1.101236	0.254361	-0.250035
2013-01-04	0.751519	-0.346030	0.384487	-0.803443

In [ ]:

df[df>0]

Out[ ]:

	A	B	C	D
2013-01-01	0.278327	0.645447	0.621123	NaN
2013-01-02	0.411894	1.101236	0.254361	NaN
2013-01-03	NaN	1.272442	0.294107	NaN
2013-01-04	0.751519	NaN	0.384487	NaN
2013-01-05	NaN	1.078977	0.502768	0.985506
2013-01-06	NaN	NaN	NaN	1.469331

In [ ]:

df2=df.copy()
df2["E"]=["one", "one", "two", "three", "four", "three"]
df2

Out[ ]:

	A	B	C	D	E
2013-01-01	0.278327	0.645447	0.621123	-0.067120	one
2013-01-02	0.411894	1.101236	0.254361	-0.250035	one
2013-01-03	-0.008112	1.272442	0.294107	-0.389570	two
2013-01-04	0.751519	-0.346030	0.384487	-0.803443	three
2013-01-05	-0.008458	1.078977	0.502768	0.985506	four
2013-01-06	-0.486277	-1.506802	-0.864283	1.469331	three

In [ ]:

df2[df2["E"].isin(["two", "four"])]
# isin() appears to need a list-like argument, even when matching a single value.

Out[ ]:

	A	B	C	D	E
2013-01-03	-0.008112	1.272442	0.294107	-0.389570	two
2013-01-05	-0.008458	1.078977	0.502768	0.985506	four

Setting¶

In [ ]:

s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range("20130102", periods=6))
s1

Out[ ]:

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [ ]:

df["F"]=s1

In [ ]:

df.at[dates[0],"A"] = 0
df

Out[ ]:

	A	B	C	D	F
2013-01-01	0.000000	0.645447	0.621123	-0.067120	NaN
2013-01-02	0.411894	1.101236	0.254361	-0.250035	1.0
2013-01-03	-0.008112	1.272442	0.294107	-0.389570	2.0
2013-01-04	0.751519	-0.346030	0.384487	-0.803443	3.0
2013-01-05	-0.008458	1.078977	0.502768	0.985506	4.0
2013-01-06	-0.486277	-1.506802	-0.864283	1.469331	5.0

In [ ]:

df.iat[0, 1] = 0
df

Out[ ]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	0.621123	-0.067120	NaN
2013-01-02	0.411894	1.101236	0.254361	-0.250035	1.0
2013-01-03	-0.008112	1.272442	0.294107	-0.389570	2.0
2013-01-04	0.751519	-0.346030	0.384487	-0.803443	3.0
2013-01-05	-0.008458	1.078977	0.502768	0.985506	4.0
2013-01-06	-0.486277	-1.506802	-0.864283	1.469331	5.0

In [ ]:

len(df)

Out[ ]:

6

In [ ]:

df.loc[:, "D"] = np.array([5]*len(df))
df

Out[ ]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	0.621123	5.0	NaN
2013-01-02	0.411894	1.101236	0.254361	5.0	1.0
2013-01-03	-0.008112	1.272442	0.294107	5.0	2.0
2013-01-04	0.751519	-0.346030	0.384487	5.0	3.0
2013-01-05	-0.008458	1.078977	0.502768	5.0	4.0
2013-01-06	-0.486277	-1.506802	-0.864283	5.0	5.0

In [ ]:

df2 = df.copy()
df2[df2 > 0] = -df2
df2

Out[ ]:

	A	B	C	D	F
2013-01-01	0.000000	0.000000	-0.621123	-5.0	NaN
2013-01-02	-0.411894	-1.101236	-0.254361	-5.0	-1.0
2013-01-03	-0.008112	-1.272442	-0.294107	-5.0	-2.0
2013-01-04	-0.751519	-0.346030	-0.384487	-5.0	-3.0
2013-01-05	-0.008458	-1.078977	-0.502768	-5.0	-4.0
2013-01-06	-0.486277	-1.506802	-0.864283	-5.0	-5.0

Missing data (2024-01-29)¶

In [ ]:

# Reindex to the first four dates and add a new, empty column "E" (all NaN).
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+["E"])
# NOTE(review): this assignment targets `df`, not `df1`. As a result df1's "E"
# column stays entirely NaN (see the output below) and `df` itself gains an "E"
# column that shows up in the later cells. The upstream "10 minutes to pandas"
# example presumably assigns to `df1` here -- confirm whether this was intended.
df.loc[dates[0]:dates[1], "E"] = 1
df1

Out[ ]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	0.621123	5.0	NaN	NaN
2013-01-02	0.411894	1.101236	0.254361	5.0	1.0	NaN
2013-01-03	-0.008112	1.272442	0.294107	5.0	2.0	NaN
2013-01-04	0.751519	-0.346030	0.384487	5.0	3.0	NaN

In [ ]:

df1.dropna(how="any")  # how="any" appears to drop every row that contains at least one missing value

Out[ ]:

	A	B	C	D	F	E

In [ ]:

df1.fillna(value=5) #결측치를 특정 값으로 다 바꿔서 채우는 방법

Out[ ]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	0.621123	5.0	5.0	5.0
2013-01-02	0.411894	1.101236	0.254361	5.0	1.0	5.0
2013-01-03	-0.008112	1.272442	0.294107	5.0	2.0	5.0
2013-01-04	0.751519	-0.346030	0.384487	5.0	3.0	5.0

In [ ]:

pd.isna(df1) #결측치가 있는가에 대한 T/F판단 있으면 T

Out[ ]:

	A	B	C	D	F	E
2013-01-01	False	False	False	False	True	True
2013-01-02	False	False	False	False	False	True
2013-01-03	False	False	False	False	False	True
2013-01-04	False	False	False	False	False	True

Operations (2024-01-30)¶

Stats¶

In [ ]:

df

Out[ ]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	0.621123	5.0	NaN	1.0
2013-01-02	0.411894	1.101236	0.254361	5.0	1.0	1.0
2013-01-03	-0.008112	1.272442	0.294107	5.0	2.0	NaN
2013-01-04	0.751519	-0.346030	0.384487	5.0	3.0	NaN
2013-01-05	-0.008458	1.078977	0.502768	5.0	4.0	NaN
2013-01-06	-0.486277	-1.506802	-0.864283	5.0	5.0	NaN

In [ ]:

df.mean()  # NaN entries appear to be left out of both the sum and the count when averaging

Out[ ]:

A    0.110094
B    0.266637
C    0.198761
D    5.000000
F    3.000000
E    1.000000
dtype: float64

In [ ]:

print(df.mean(axis=1))
# As with the column means, NaN entries appear to be excluded from both the sum
# and the count; with axis=1 the mean is taken across all columns for each row.
# Passing 1 positionally instead of axis=1 seemed to give the same result.

2013-01-01    1.324225
2013-01-02    1.461248
2013-01-03    1.711688
2013-01-04    1.757995
2013-01-05    2.114657
2013-01-06    1.428528
Freq: D, dtype: float64

In [ ]:

s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
s

Out[ ]:

2013-01-01    1.0
2013-01-02    3.0
2013-01-03    5.0
2013-01-04    NaN
2013-01-05    6.0
2013-01-06    8.0
Freq: D, dtype: float64

In [ ]:

s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

Out[ ]:

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html#pandas.Series
Series의 메소드인 shift가 뭐하는 녀석인지 몰라서 문서에 들어가보니 아래와 같이 서술되어있었다
shift([periods, freq, axis, fill_value, suffix])
Shift index by desired number of periods with an optional time freq.
자세한 내용은 모르겠지만 대강 값들을 index를 기준으로 준 값만큼 이동시키는 듯하다
1일과 2일의 경우 이전 인덱스에 해당하는 데이터가 없으니 NaN로 결측치가 된 것이고
3,4,5일의 경우 인덱스상으로 2이전의 값들이 존재했으니 해당값들로 바뀌었고
6일의 경우 인덱스상으로 2이전의 값이 NaN로 결측치 였기에 결측치로 되었는 듯하다.

In [ ]:

df.sub(s, axis='index')

Out[ ]:

	A	B	C	D	F	E
2013-01-01	NaN	NaN	NaN	NaN	NaN	NaN
2013-01-02	NaN	NaN	NaN	NaN	NaN	NaN
2013-01-03	-1.008112	0.272442	-0.705893	4.0	1.0	NaN
2013-01-04	-2.248481	-3.346030	-2.615513	2.0	0.0	NaN
2013-01-05	-5.008458	-3.921023	-4.497232	0.0	-1.0	NaN
2013-01-06	NaN	NaN	NaN	NaN	NaN	NaN

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html#pandas.DataFrame 이번엔 df의 메소드인 sub에 대하여 설명이 없어서 찾아보니 아래와 같이 메소드로 설명되어있었고 잘 이해가 되지 않아 좀더 자세한 독스에 들어갔다
sub(other[, axis, level, fill_value])
Get Subtraction of dataframe and other, element-wise (binary operator sub).
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sub.html#pandas.DataFrame.sub
DataFrame.sub(other, axis='columns', level=None, fill_value=None)
대강 구조를 보니 df에 대하여 빼기를 해주는데 sub()안에 들어있는 값으로 빼주는 것으로 들어올 수 있는 other로는 scalar, sequence, Series, dict or DataFrame이 가능하다고 한다 이중 sequence는 정확히 뭔지 모르겠어서 찾아보니
https://wikidocs.net/84391
내용을 참고하였다 대강 리스트, 문자열, 튜플 이 세가지 자료형이라고 생각하면 될듯하다
그리고 axis에 대하여 0 or index , 1 or columns의 옵션이 있는데 기본값은 columns로 되어있다
fill_value는 결측치를 만났을 때 특정값으로 바꿔넣어줄 것인지 선택하는 옵션으로 기본값은 None으로 되어있다
level 옵션에 대해서는 잘모르겠다 예제도 한문제 뿐인데 잘 이해가 되지 않는다 대강 멀티인덱스를 사용할 경우에 사용하는 느낌인데 예제를 아래 그대로 가져와서 해보고 이해해보도록 하겠다

level 예제¶

In [ ]:

# Single-level frame used as the left operand of the `level=` example.
level_example_df = pd.DataFrame(
    data={"angles": [0, 3, 4], "degrees": [360, 180, 360]},
    index=["circle", "triangle", "rectangle"],
)
level_example_df

Out[ ]:

	angles	degrees
circle	0	360
triangle	3	180
rectangle	4	360

In [ ]:

# Two-level (MultiIndex) frame: the outer level is the group letter, the inner
# level is the shape name that the `level=1` alignment matches on.
level_example_df_multindex = pd.DataFrame(
    data={
        "angles": [0, 3, 4, 4, 5, 6],
        "degrees": [360, 180, 360, 360, 540, 720],
    },
    index=[
        ["A"] * 3 + ["B"] * 3,
        ["circle", "triangle", "rectangle", "square", "pentagon", "hexagon"],
    ],
)
level_example_df_multindex

Out[ ]:

		angles	degrees
A	circle	0	360
	triangle	3	180
	rectangle	4	360
B	square	4	360
	pentagon	5	540
	hexagon	6	720

In [ ]:

print(level_example_df.div(level_example_df_multindex, level=1))
# The original docs example also passed fill_value=0, which substitutes 0 for missing values.
# The level= behaviour became clearer after a ChatGPT walkthrough:
# https://chat.openai.com/share/73654015-96b4-4fb3-b6e3-243526afb670

             angles  degrees
A circle        NaN      1.0
  triangle      1.0      1.0
  rectangle     1.0      1.0
B square        NaN      NaN
  pentagon      NaN      NaN
  hexagon       NaN      NaN

User defined functions¶

In [ ]:

display(df)
print(df.mean(),'\n')
print(df.mean()*5.6)

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	0.621123	5.0	NaN	1.0
2013-01-02	0.411894	1.101236	0.254361	5.0	1.0	1.0
2013-01-03	-0.008112	1.272442	0.294107	5.0	2.0	NaN
2013-01-04	0.751519	-0.346030	0.384487	5.0	3.0	NaN
2013-01-05	-0.008458	1.078977	0.502768	5.0	4.0	NaN
2013-01-06	-0.486277	-1.506802	-0.864283	5.0	5.0	NaN

A    0.110094
B    0.266637
C    0.198761
D    5.000000
F    3.000000
E    1.000000
dtype: float64 

A     0.616528
B     1.493168
C     1.113059
D    28.000000
F    16.800000
E     5.600000
dtype: float64

In [ ]:

df.agg(lambda x: np.mean(x)*5.6)

Out[ ]:

A     0.616528
B     1.493168
C     1.113059
D    28.000000
F    16.800000
E     5.600000
dtype: float64

In [ ]:

df*101.2

Out[ ]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	62.857630	506.0	NaN	101.2
2013-01-02	41.683644	111.445104	25.741333	506.0	101.2	101.2
2013-01-03	-0.820894	128.771142	29.763671	506.0	202.4	NaN
2013-01-04	76.053709	-35.018228	38.910072	506.0	303.6	NaN
2013-01-05	-0.855964	109.192441	50.880120	506.0	404.8	NaN
2013-01-06	-49.211214	-152.488377	-87.465405	506.0	506.0	NaN

In [ ]:

df.transform(lambda x: x*101.2)

Out[ ]:

	A	B	C	D	F	E
2013-01-01	0.000000	0.000000	62.857630	506.0	NaN	101.2
2013-01-02	41.683644	111.445104	25.741333	506.0	101.2	101.2
2013-01-03	-0.820894	128.771142	29.763671	506.0	202.4	NaN
2013-01-04	76.053709	-35.018228	38.910072	506.0	303.6	NaN
2013-01-05	-0.855964	109.192441	50.880120	506.0	404.8	NaN
2013-01-06	-49.211214	-152.488377	-87.465405	506.0	506.0	NaN

agg는 대강은 알듯한데 transform은 agg와 뭐가 다른지 모르겠다
추가로 유사한 것에 apply도 있던 것으로 기억하는데 여기는 왜 빠져있고 차이가 뭔지도 궁금하다
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.agg.html#pandas.DataFrame.agg
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transform.html#pandas.DataFrame.transform
https://pandas.pydata.org/pandas-docs/stable/user_guide/gotchas.html#gotchas-udf-mutation
chatGPT에게 차이를 물어본 내용: https://chat.openai.com/share/654feb01-8d50-481e-8c7a-ca160e1e74a4

Value Counts¶

In [ ]:

s = pd.Series(np.random.randint(0,7,size=10))
s

Out[ ]:

0    2
1    6
2    5
3    2
4    4
5    3
6    3
7    0
8    5
9    5
dtype: int32

In [ ]:

s.value_counts()

Out[ ]:

5    3
2    2
3    2
6    1
4    1
0    1
Name: count, dtype: int64

String Methods¶

In [ ]:

s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
print(s)
s.str.lower()

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

Out[ ]:

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

pandas.Series: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html#pandas.Series
Vectorized String Methods: https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#text-string-methods

Merge (2024-01-31)¶

Concat¶

In [ ]:

df = pd.DataFrame(np.random.randn(10,4))
# np.random.randn(m, n): returns an (m, n) array of random samples drawn from
# the standard normal distribution (mean 0, standard deviation 1).
df

Out[ ]:

	0	1	2	3
0	-1.700155	-0.317837	0.347565	-2.268710
1	-0.661258	0.718931	-0.244838	-2.039249
2	-0.144064	-0.654301	1.647890	-0.281090
3	-0.115725	-0.007579	-1.199584	-0.526254
4	-0.488054	0.521904	-0.279547	0.080345
5	-0.198320	1.062847	0.525446	-0.112844
6	-1.368007	0.701267	1.235764	-0.998641
7	-1.220871	1.183501	1.355200	-0.126375
8	-1.540007	-0.821376	-0.216658	-0.176388
9	-0.703783	0.192384	-1.320056	0.587904

np.random.randn(m,n) https://nittaku.tistory.com/443
docs : https://numpy.org/doc/stable/reference/random/generated/numpy.random.randn.html

In [ ]:

pieces = [df[:3], df[3:7], df[7:]]
pieces

Out[ ]:

[          0         1         2         3
 0 -1.700155 -0.317837  0.347565 -2.268710
 1 -0.661258  0.718931 -0.244838 -2.039249
 2 -0.144064 -0.654301  1.647890 -0.281090,
           0         1         2         3
 3 -0.115725 -0.007579 -1.199584 -0.526254
 4 -0.488054  0.521904 -0.279547  0.080345
 5 -0.198320  1.062847  0.525446 -0.112844
 6 -1.368007  0.701267  1.235764 -0.998641,
           0         1         2         3
 7 -1.220871  1.183501  1.355200 -0.126375
 8 -1.540007 -0.821376 -0.216658 -0.176388
 9 -0.703783  0.192384 -1.320056  0.587904]

In [ ]:

pd.concat(pieces)

Out[ ]:

	0	1	2	3
0	-1.700155	-0.317837	0.347565	-2.268710
1	-0.661258	0.718931	-0.244838	-2.039249
2	-0.144064	-0.654301	1.647890	-0.281090
3	-0.115725	-0.007579	-1.199584	-0.526254
4	-0.488054	0.521904	-0.279547	0.080345
5	-0.198320	1.062847	0.525446	-0.112844
6	-1.368007	0.701267	1.235764	-0.998641
7	-1.220871	1.183501	1.355200	-0.126375
8	-1.540007	-0.821376	-0.216658	-0.176388
9	-0.703783	0.192384	-1.320056	0.587904

Join¶

In [ ]:

left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
left

Out[ ]:

	key	lval
0	foo	1
1	foo	2

In [ ]:

right

Out[ ]:

	key	rval
0	foo	4
1	foo	5

In [ ]:

pd.merge(left, right, on="key")

Out[ ]:

	key	lval	rval
0	foo	1	4
1	foo	1	5
2	foo	2	4
3	foo	2	5

In [ ]:

left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]})
left

Out[ ]:

	key	lval
0	foo	1
1	bar	2

In [ ]:

right

Out[ ]:

	key	rval
0	foo	4
1	bar	5

In [ ]:

pd.merge(left, right, on="key")

Out[ ]:

	key	lval	rval
0	foo	1	4
1	bar	2	5

Grouping (2024-02-01)¶

Splitting the data into groups based on some criteria

Applying a function to each group independently

Combining the results into a data structure

In [ ]:

# Two label columns to group on ("A", "B") plus two random numeric columns.
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
        "C": np.random.randn(8),
        "D": np.random.randn(8),
    }
)
df

Out[ ]:

	A	B	C	D
0	foo	one	0.138702	-0.470424
1	bar	one	-0.831213	2.143912
2	foo	two	0.595156	-0.315410
3	bar	three	-1.870080	0.370945
4	foo	two	-0.471530	0.238094
5	bar	two	-0.631674	-0.700090
6	foo	one	0.968913	-0.954628
7	foo	three	-0.068441	-0.648986

In [ ]:

df.groupby("A")[["C", "D"]].sum()

Out[ ]:

	C	D
A
bar	-3.332967	1.814767
foo	1.162800	-2.151353

In [ ]:

df.groupby(["A", "B"]).sum()

Out[ ]:

		C	D
A	B
bar	one	-0.831213	2.143912
	three	-1.870080	0.370945
	two	-0.631674	-0.700090
foo	one	1.107615	-1.425052
	three	-0.068441	-0.648986
	two	0.123626	-0.077316

Reshaping (2024-02-02)¶

Stack¶

In [ ]:

# Two parallel label lists define the outer ("first") and inner ("second")
# levels of the row MultiIndex.
arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two"] * 4,
]
index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=list("AB"))
df2 = df.iloc[:4]  # first four rows, selected by position
df2

Out[ ]:

		A	B
first	second
bar	one	-0.676465	0.464446
bar	two	-1.332632	-0.253777
baz	one	1.547494	-0.184863
baz	two	2.161207	1.346286

In [ ]:

# `future_stack=True` was dropped here because the call would not run with it.
# That keyword was only added in pandas 2.1, while this notebook runs pandas
# 2.0.3 (see the pd.__version__ cell below). The result did not look noticeably
# different without it.
stacked = df2.stack()
stacked

Out[ ]:

first  second   
bar    one     A   -0.676465
               B    0.464446
       two     A   -1.332632
               B   -0.253777
baz    one     A    1.547494
               B   -0.184863
       two     A    2.161207
               B    1.346286
dtype: float64

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.stack.html#pandas.DataFrame.stack
부분을 보니 pandas2.0에서 pandas 3.0의 방식을 사용할 때 해주던 방식으로 유추되는데

In [ ]:

pd.__version__

Out[ ]:

'2.0.3'

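2.0버전 쓰고 있는 것이 맞는데 왜 안되는 거지..? → `future_stack` 인자는 pandas 2.1.0에서 새로 추가된 것이라, 현재 사용 중인 2.0.3에는 아직 없어서 오류가 난 것이다. pandas를 2.1 이상으로 올리면 `df2.stack(future_stack=True)`를 쓸 수 있다.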

In [ ]:

stacked.unstack()

Out[ ]:

		A	B
first	second
bar	one	-0.676465	0.464446
bar	two	-1.332632	-0.253777
baz	one	1.547494	-0.184863
baz	two	2.161207	1.346286

In [ ]:

# unstack(1) pivots index level 1 ("second") into the columns; "first" and the
# A/B level remain as the row index.
stacked.unstack(1)

Out[ ]:

	second	one	two
first
bar	A	-0.676465	-1.332632
bar	B	0.464446	-0.253777
baz	A	1.547494	2.161207
baz	B	-0.184863	1.346286

In [ ]:

# unstack(0) pivots index level 0 ("first") into the columns; "second" and the
# A/B level remain as the row index.
stacked.unstack(0)

Out[ ]:

	first	bar	baz
second
one	A	-0.676465	1.547494
one	B	0.464446	-0.184863
two	A	-1.332632	2.161207
two	B	-0.253777	1.346286

Pivot tables¶

In [ ]:

# Three categorical columns (A, B, C) to pivot on and two random value columns.
df = pd.DataFrame(
    dict(
        A=["one", "one", "two", "three"] * 3,
        B=["A", "B", "C"] * 4,
        C=["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        D=np.random.randn(12),
        E=np.random.randn(12),
    )
)
df

Out[ ]:

	A	B	C	D	E
0	one	A	foo	-0.807974	-1.302463
1	one	B	foo	1.290553	-0.334357
2	two	C	foo	-0.475818	0.133969
3	three	A	bar	1.196717	0.865346
4	one	B	bar	-0.227697	-1.362118
5	one	C	bar	-0.038830	1.381041
6	two	A	foo	2.163772	-0.897080
7	three	B	foo	-2.336250	-0.953737
8	one	C	foo	0.216059	0.169161
9	one	A	bar	-0.126712	1.120767
10	two	B	bar	-0.304438	0.750880
11	three	C	bar	0.453993	-1.094076

In [ ]:

pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Out[ ]:

	C	bar	foo
A	B
one	A	-0.126712	-0.807974
	B	-0.227697	1.290553
	C	-0.038830	0.216059
three	A	1.196717	NaN
	B	NaN	-2.336250
	C	0.453993	NaN
two	A	NaN	2.163772
	B	-0.304438	NaN
	C	NaN	-0.475818

자동으로 commit되게 yml추가해준 것 테스트용도

ABOUT ME

kyeob 개발일지 kyeob 개발일지

Object creation (2024-01-26)¶

Viewing data (2024-01-27)¶

Selection (2024-01-28)¶

Getitem ([])¶

Selection by label¶

Selection by position¶

Boolean indexing (조건에 따른 컬럼 선택)¶

Setting¶

Missing data (2024-01-29)¶

Operations (2024-01-30)¶

Stats¶

level 예제¶

User defined functions¶

Value Counts¶

String Methods¶

Merge (2024-01-31)¶

Concat¶

Join¶

Grouping (2024-02-01)¶

Reshaping (2024-02-02)¶

Stack¶

Pivot tables¶

'코딩 공부 > Pandas' 카테고리의 다른 글

티스토리툴바

Pandas와 친해지기(10분 Pandas) (2024-02-07) (1)	2024.02.07
Pandas와 친해지기(10분 Pandas) (2024-02-06) (1)	2024.02.06
Pandas와 친해지기(10분 Pandas) (2024-02-05) (0)	2024.02.05
Pandas와 친해지기(10분 Pandas) (2024-02-04) (0)	2024.02.04
Pandas와 친해지기(10분 Pandas) (2024-02-03) (0)	2024.02.03

ABOUT ME

Object creation (2024-01-26)¶

Viewing data (2024-01-27)¶

Selection (2024-01-28)¶

Getitem ([])¶

Selection by label¶

Selection by position¶

Boolean indexing (조건에 따른 컬럼 선택)¶

Setting¶

Missing data (2024-01-29)¶

Operations (2024-01-30)¶

Stats¶

level 예제¶

User defined functions¶

Value Counts¶

String Methods¶

Merge (2024-01-31)¶

Concat¶

Join¶

Grouping (2024-02-01)¶

Reshaping (2024-02-02)¶

Stack¶

Pivot tables¶

'코딩 공부 > Pandas' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바