In [1]:
import pandas as pd
import numpy as np
In [2]:
# In[2]: Build a demo DataFrame from a dict of Series.
d = {'Name': pd.Series(['Tom', 'James', 'Ricky', 'Vin', 'Steve', 'Smith', 'Jack',
                        'Lee', 'David', 'Gasper', 'Betina', 'Andres']),
     'Age': pd.Series([25, 26, 25, 23, 30, 29, 23, 34, 40, 30, 51, 46]),
     'Rating': pd.Series([4.23, 3.24, 3.98, 2.56, 3.20, 4.6, 3.8, 3.78, 2.98, 4.80, 4.10, 3.65])}
# Create a DataFrame
df = pd.DataFrame(d)
print(df)

# In[3]: column-wise sums.
# FIX: numeric_only=True skips the string 'Name' column; without it,
# sum() concatenates the names and mean()/std() below raise TypeError
# on pandas >= 2.0 (numeric_only no longer defaults to True).
print(df.sum(numeric_only=True))

# In[4]: row-wise sums (axis=1 means "across the columns").
print(df.sum(axis=1, numeric_only=True))

# In[5]:
print(df.mean(numeric_only=True))

# In[6]:
print(df.std(numeric_only=True))

# In[7]: describe() summarises the numeric columns by default.
print(df.describe())

# In[8]: restrict describe() to object / numeric columns explicitly.
print(df.describe(include=['object']))
print(df.describe(include=['number']))  # all numeric columns

# In[9]: include='all' summarises every column regardless of dtype.
print(df.describe(include='all'))
In [10]:
# In[10]: table-wise function application via DataFrame.pipe.
def adder(ele1, ele2):
    """Return the element-wise sum of two addable objects.

    FIX: the function body was flush-left in the source, which is a
    syntax error; it must be indented under the def statement.
    """
    return ele1 + ele2

df_tablewise = pd.DataFrame(np.random.randn(5, 3), columns=['col1', 'col2', 'col3'])
print("Original table : \n {} " .format(df_tablewise))
print("\n After adding 2 by tablewise operation")
# pipe() passes the whole DataFrame as the first argument of adder.
print(df_tablewise.pipe(adder, 2))
In [11]:
# In[11]: column-wise function application with DataFrame.apply.
# With the default axis=0 the function receives one column at a time.
df_columnwise = pd.DataFrame(np.random.randn(5, 3), columns=['col1', 'col2', 'col3'])
print(df_columnwise)
column_means = df_columnwise.apply(np.mean)  # requires numeric data
print(column_means)
# print(df.apply(np.mean))
In [12]:
# In[12]: row-wise application — axis=0 applies per column and
# axis=1 applies per row.
df_rowwise = pd.DataFrame(np.random.randn(5, 3), columns=['col1', 'col2', 'col3'])
print(df_rowwise)
row_means = df_rowwise.apply(np.mean, axis=1)
print(row_means)
# print(df.apply(np.mean))
In [13]:
# In[13]: element-wise function application.
df_elementwise = pd.DataFrame(np.random.randn(5, 3), columns=['col1', 'col2', 'col3'])
print(df_elementwise)
# My custom element-wise function.
scale = lambda x: x * 100
# FIX: DataFrame.applymap() is deprecated since pandas 2.1; applying
# Series.map column-by-column is equivalent and stable across versions.
op = df_elementwise.apply(lambda col: col.map(scale))
print(op)
print(op.apply(np.mean))
df_elementwise['col1'].map(scale)  # the same thing for a single column
Out[13]:
In [14]:
# In[14]: reindexing a DataFrame.
import pandas as pd
import numpy as np

N = 20
df_reindex = pd.DataFrame({
    'A': pd.date_range(start='2016-01-01', periods=N, freq='D'),
    'x': np.linspace(0, stop=N - 1, num=N),
    'y': np.random.rand(N),
    'C': np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    'D': np.random.normal(100, 10, size=(N)).tolist(),
})
print(df_reindex)

# Keep only rows 0, 2 and 5 and columns A, C and B. Column 'B' does not
# exist in the source frame, so reindex() creates it filled with NaN.
df_reindexed = df_reindex.reindex(index=[0, 2, 5], columns=['A', 'C', 'B'])
print(df_reindexed)
In [15]:
# In[15]: reindex_like aligns df1 to df2's index and columns,
# so the 10-row df1 is trimmed down to df2's 7 rows.
df1 = pd.DataFrame(np.random.randn(10, 3), columns=['col1', 'col2', 'col3'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['col1', 'col2', 'col3'])
print(df1)
print(df2)
df1 = df1.reindex_like(df2)
print(df1)
In [16]:
# In[16]: reindex_like pads the three extra rows (7..9) with NaN.
df1 = pd.DataFrame(np.random.randn(10, 3), columns=['col1', 'col2', 'col3'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['col1', 'col2', 'col3'])
# Padding NaN's
print(df2.reindex_like(df1))
# Now fill the NaN's with preceding values.
print("Data Frame with Forward Fill:")
# BUG FIX: the text promises a *forward* fill (copy the preceding row
# down), so the method must be 'ffill' — the original used 'bfill'.
# limit=1 means only one missing row (index 7) gets filled.
df2_filled = df2.reindex_like(df1, method='ffill', limit=1)
print(df2_filled)
In [17]:
# In[17]: rename row labels and column names via mapping dicts;
# labels not present in the mapping are left unchanged.
df1 = pd.DataFrame(np.random.randn(6, 3), columns=['col1', 'col2', 'col3'])
print(df1)
print("After renaming the rows and columns:")
renamed = df1.rename(columns={'col1': 'c1', 'col2': 'c2'},
                     index={0: 'apple', 1: 'banana', 2: 'durian'})
print(renamed)
In [18]:
# In[18]: iterating a DataFrame. Three row iterators exist
# (items, iterrows, itertuples); plain iteration yields column names.
N = 20
df = pd.DataFrame({
    'A': pd.date_range(start='2016-01-01', periods=N, freq='D'),
    'x': np.linspace(0, stop=N - 1, num=N),
    'y': np.random.rand(N),
    'C': np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    'D': np.random.normal(100, 10, size=(N)).tolist(),
})
print(df)
# FIX: the loop body was flush-left in the source, which is a syntax
# error; it must be indented under the for statement.
for col in df:
    print(col)
In [19]:
# In[19]: column-wise iteration over (name, Series) pairs.
# FIX: DataFrame.iteritems() was removed in pandas 2.0;
# items() is the long-standing equivalent.
df = pd.DataFrame(np.random.randn(4, 3), columns=['col1', 'col2', 'col3'])
print(df)
for key, value in df.items():
    print(key, "\n", value)
    # print(type(value))  # each value is a pandas Series
In [20]:
# In[20]: iterrows() yields (index, row-as-Series) pairs.
df = pd.DataFrame(np.random.randn(4, 3), columns=['col1', 'col2', 'col3'])
# FIX: the loop body was flush-left in the source; re-indented.
for row_index, row in df.iterrows():
    print(row_index, row)
    # print(type(row))  # pandas Series
In [21]:
# In[21]: itertuples() yields one namedtuple per row (index first).
df = pd.DataFrame(np.random.randn(4, 3), columns=['col1', 'col2', 'col3'])
# FIX: the loop body was flush-left in the source; re-indented.
for row in df.itertuples():
    print(row)
    # print(type(row))  # a namedtuple ("Pandas"), not a list
In [22]:
# In[22]: sorting — two methods: 1. by index  2. by actual values.
unsorted_df = pd.DataFrame(np.random.randn(10, 2),
                           index=[1, 4, 6, 2, 3, 5, 9, 8, 0, 7],
                           columns=['col2', 'col1'])
print(unsorted_df)

# In[23]: sort rows by index label (ascending by default).
sorted_df = unsorted_df.sort_index()
print(sorted_df)

# In[24]: order of sorting — descending.
sorted_df = unsorted_df.sort_index(ascending=False)
print(sorted_df)

# In[25]: axis=1 sorts the *column labels* left-to-right.
# FIX: the original comment claimed the default axis=0 sorts columns;
# it actually sorts the row index, and axis=1 sorts column labels.
print(unsorted_df)
sorted_df = unsorted_df.sort_index(axis=1)
print(sorted_df)

# In[26]: sorting by value.
unsorted_df = pd.DataFrame({'col1': [2, 1, 1, 1], 'col2': [1, 3, 2, 4]})
sorted_df = unsorted_df.sort_values(by='col1')
print(unsorted_df)
print(sorted_df)
# mergesort is the stable choice among the available algorithms.
sorted_df = unsorted_df.sort_values(by='col1', kind='mergesort')
print(sorted_df)
In [27]:
# In[27]: pandas option API — get_option(), set_option(),
# reset_option(), describe_option(), option_context().

# Default display limits for rows and columns.
print(pd.get_option("display.max_rows"))
print(pd.get_option("display.max_columns"))
print("\n")

pd.set_option("display.max_rows", 90)
pd.set_option("display.max_columns", 45)
print(pd.get_option("display.max_rows"))
print(pd.get_option("display.max_columns"))
print("\n")

pd.reset_option("display.max_rows")      # reset to default value
pd.reset_option("display.max_columns")   # reset to default value
print(pd.get_option("display.max_rows"))
print(pd.get_option("display.max_columns"))
print("\n")

# option_context sets an option temporarily, for this block only.
# FIX: the two prints were flush-left in the source; they must be
# indented to run inside the context manager.
with pd.option_context("display.max_rows", 10):
    print(pd.get_option("display.max_rows"))
    print(pd.get_option("display.max_columns"))
In [28]:
# In[28]: label-based indexing with .loc.
df = pd.DataFrame(np.random.randn(8, 4),
                  index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
                  columns=['A', 'B', 'C', 'D'])
print(df)
print(df.loc[:, 'A'])                            # every row, one column
print(df.loc[:, ['A', 'C']])                     # every row, a list of columns
print(df.loc[['a', 'b', 'f', 'h'], ['A', 'C']])  # a few rows, a few columns
print(df.loc['a':'f'])                           # label slice, inclusive of 'f'
print(df.loc['a'] > 0)                           # boolean mask of one row
In [29]:
# In[29]: integer-position indexing. Like python and numpy, .iloc is
# 0-based and end-exclusive.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
print(df)
print(df.iloc[:4])                 # first four rows
print(df.iloc[1:5, 2:4])
print(df.iloc[[1, 3, 5], [1, 3]])  # slicing through lists of positions
print(df.iloc[1:3, :])
print(df.iloc[:, 1:3])

print("\n hybrid approach")
# FIX: df.ix was deprecated and later removed from pandas; .loc is the
# replacement. On this default RangeIndex, df.loc[:3] is label-based
# and therefore *includes* row 3 — the very behaviour the original
# comment found "strange" about ix.
print(df.loc[:3])
print(df.loc[:, 'A'])

# In[30]: bracket notation and attribute access select columns too.
print(df[['A', 'B']])
print(df.A)
In [31]:
# In[31]: statistical functions — pct_change() returns the *fractional*
# change of each element relative to the previous one.
s = pd.Series([12, 3, 4, 5, 4])
print(s.pct_change())
df = pd.DataFrame(np.random.randn(5, 2))
print(df)
print(df.pct_change(axis=0))  # axis=0 (default) works down each column
In [32]:
# In[32]: covariance between two Series.
s1 = pd.Series(np.random.randn(10))
s2 = pd.Series(np.random.randn(10))
print(s1.cov(s2))
In [33]:
# In[33]: covariance on a DataFrame.
frame = pd.DataFrame(np.random.randn(10, 5), columns=['a', 'b', 'c', 'd', 'e'])
print(frame['a'].cov(frame['b']))  # covariance of two columns
print(frame.cov())                 # full pairwise covariance matrix
In [34]:
# In[34]: correlation — same shapes as cov() above, but normalised
# to the [-1, 1] range.
print(frame['a'].corr(frame['b']))  # correlation between two columns
print(frame.corr())                 # full pairwise correlation matrix
In [35]:
# In[35]: window functions — mainly used to expose trends in the data.
df = pd.DataFrame(np.random.randn(10, 4),
                  index=pd.date_range('1/1/2000', periods=10),
                  columns=['A', 'B', 'C', 'D'])
print(df)
print("\n after rolling window with size 3")
# rolling: fixed-size window; first window-1 rows are NaN.
print(df.rolling(window=3).mean())
print("\n after expanding min period 3")
# expanding: window grows from min_periods to the full length.
print(df.expanding(min_periods=3).mean())
print("\n after assigining weight exponentially")
# ewm: exponentially weighted mean with centre of mass 0.5.
print(df.ewm(com=0.5).mean())
In [36]:
# In[36]: missing values (NaN).
df = pd.DataFrame(np.random.randn(5, 3),
                  index=['a', 'c', 'e', 'f', 'h'],
                  columns=['one', 'two', 'three'])
# Reindexing to a larger index introduces all-NaN rows b, d and g.
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
print(df)

# Check for missing values — two methods: isnull and notnull.
print(df['one'].isnull())
print(df['one'].notnull())
print("\n sum: ")
print(df['one'].sum())  # treats NaN values as zero
# If the data are all NA, the result will be NA.

# Cleaning / filling missing data.
print(df.fillna(0))  # fill NaN with zeros
# FIX: fillna(method='bfill') is deprecated in modern pandas; the
# dedicated bfill() (backward fill) / ffill() methods are the
# supported spelling and behave identically.
print(df.bfill())
print(df.dropna())         # drop rows containing missing values
print(df.dropna(axis=1))   # drop columns containing missing values
In [37]:
# In[37]: replacing placeholder / generic values via a mapping dict.
df = pd.DataFrame({'one': [10, 20, 30, 40, 50, 2000],
                   'two': [1000, 0, 30, 40, 50, 60]})
replaced = df.replace({1000: 10, 2000: 60})
print(replaced)
In [38]:
# In[38]: splitting data into groups with groupby.
ipl_data = {
    'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
             'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
    'Points': [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690],
}
df = pd.DataFrame(ipl_data)
print(df)
print('\n result')
print(df.groupby('Team'))          # a lazy GroupBy object, not data
print(df.groupby('Team').groups)   # mapping of group key -> row labels
In [39]:
# In[39]: merging / joining two frames on a shared key column.
left = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'subject_id': ['sub1', 'sub2', 'sub4', 'sub6', 'sub5'],
})
right = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'subject_id': ['sub2', 'sub4', 'sub3', 'sub6', 'sub5'],
})
print(left)
print(right)
# Overlapping non-key columns get _x / _y suffixes in the result.
print(pd.merge(left, right, on='id'))
In [40]:
# In[40]: merging on both keys — only rows where id *and* subject_id
# match in the two frames survive the (default inner) join.
print(pd.merge(left, right, on=['id', 'subject_id']))
In [41]:
# In[41]: the how= argument selects the join type:
# 'left', 'right', 'outer' or 'inner'.
print(pd.merge(left, right, on=['id', 'subject_id'], how='left'))
In [42]:
# In[42]: concatenating two DataFrames (stacked, default axis=0).
one = pd.DataFrame({
    'Name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'subject_id': ['sub1', 'sub2', 'sub4', 'sub6', 'sub5'],
    'Marks_scored': [98, 90, 87, 69, 78]},
    index=[1, 2, 3, 4, 5])
two = pd.DataFrame({
    'Name': ['Billy', 'Brian', 'Bran', 'Bryce', 'Betty'],
    'subject_id': ['sub2', 'sub4', 'sub3', 'sub6', 'sub5'],
    'Marks_scored': [89, 80, 79, 97, 88]},
    index=[1, 2, 3, 4, 5])
print(pd.concat([one, two]))
print("\n concat with keys")
# keys= tags each input frame, producing a hierarchical row index.
print(pd.concat([one, two], keys=['x', 'y']))
In [43]:
# In[43]: ignore_index=True makes the result build its own fresh index
# (the keys= labels are discarded in that case).
print(pd.concat([one, two], keys=['x', 'y'], ignore_index=True))
# axis=1 concatenates along *columns* (side by side) —
# the original comment had the axes reversed.
print(pd.concat([one, two], axis=1))
# FIX: DataFrame.append() was removed in pandas 2.0; pd.concat is the
# supported replacement and produces the same stacked result.
print(pd.concat([one, two]))
In [44]:
# In[44]: FIX — pd.datetime was removed in pandas 2.0; use the
# standard-library datetime module directly.
from datetime import datetime
print(datetime.now())
In [45]:
# In[45]: pandas categorical dtype — each distinct value is stored
# once and the rows hold small integer codes.
s = pd.Series(["a", "b", "c", "a"], dtype="category")
print(s)
In [46]:
### Visualization pandas
# Basic plotting through the DataFrame.plot accessor (matplotlib
# backend): 10 days of random data, one series per column A-D.
df = pd.DataFrame(np.random.randn(10,4),index=pd.date_range('1/1/2000',
periods=10), columns=list('ABCD'))
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
df.plot()
# Bar charts: grouped by default, then stacked.
df.plot.bar()
df.plot.bar(stacked=True)
# Histogram of every column with 20 bins, then a box plot per column.
df.plot.hist(bins=20)
df.plot.box()
Out[46]:
In [47]:
# Area, scatter and pie plots from the DataFrame.plot accessor.
df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df.plot.area()
df.plot.scatter(x='a', y='b')
# NOTE(review): this fails because `diff` is a method, not an attribute;
# it would need to be df.diff().hist(bins=20).
#df.diff.hist(bins=20) ## not working
# Pie plot works on a single column; subplots=True draws one pie per column.
df = pd.DataFrame(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], columns=['x'])
df.plot.pie(subplots=True)
Out[47]:
In [48]:
# In[48]: read a CSV from the working directory and list its columns.
df = pd.read_csv("temp.csv")
# df = pd.read_csv("temp.csv", index_col=['S.No'])  # use S.No as the index
print(df.columns)
In [49]:
# In[49]: force a column's dtype while parsing (converters).
df = pd.read_csv("temp.csv", dtype={'DistanceRaw2GPS': np.float64})
print(df.dtypes)
In [50]:
# In[50]: supply custom header names, then re-read skipping the first
# two rows (the second read overwrites the first, so only the
# skiprows result is kept).
df = pd.read_csv("temp.csv", names=['GPS_Latitude', 'DistanceRaw2GPS'])
df = pd.read_csv("temp.csv", skiprows=2)
# df
In [51]:
# In[51]: sparse data — store only the non-missing entries.
ts = pd.Series(np.random.randn(10))
ts[2:-2] = np.nan  # blank out positions 2..7, keeping two values at each end
# FIX: Series.to_sparse() was removed in pandas 1.0; converting to a
# SparseDtype is the supported replacement.
sts = ts.astype(pd.SparseDtype("float", np.nan))
print(sts)
# .to_dense() likewise moved under the .sparse accessor.
print(sts.sparse.to_dense())
# print("Total score for \n ", ts, "is \n", sts)
### Python Pandas - Caveats & Gotchas part remaining
No comments:
Post a Comment