Pandas Land

Introduction

This is Panda exercises:

Data Frames

In this section, I explore how to create data frames from different ways:

  • list
  • dictionary
  • Json
  • csv file

In addition to that basic Data Frame (DF) manipulations:

import pandas as pd

cols = { 'name': ['Ted', 'Mak', 'Nina','Leo']
        ,'age' : [50, 20, 33, 25] }

l_students= [{'name':'Tailor', 'grade':'10','math':60}
            ,{'name':'Lora', 'grade':'09','math':80}
            ,{'name':'Joe', 'grade':'11','math':56.90}
            ,{'name':'Tailor', 'grade':'11','math':68.98} ]

studemtDF = pd.DataFrame(l_students)

# read from the json
import json
json_students = json.dumps(l_students)
# [
#   {
#     "grade": "10",
#     "name": "Tailor",
#     "math": 60
#   },
#   {
#     "grade": "09",
#     "name": "Lora",
#     "math": 80
#   },
#   {
#     "grade": "11",
#     "name": "Joe",
#     "math": 56.9
#   },
#   {
#     "grade": "11",
#     "name": "Tailor",
#     "math": 68.98
#   }
# ]
json_std_df = pd.read_json(json_students)
#   grade   math    name
# 0 10  60.00   Tailor
# 1 9   80.00   Lora
# 2 11  56.90   Joe
# 3 11  68.98   Tailor

json_cols = json.dumps(cols)
# {
#   "age": [
#     50,
#     20,
#     33,
#     25
#   ],
#   "name": [
#     "Ted",
#     "Mak",
#     "Nina",
#     "Leo"
#   ]
# }
json_cols_df = pd.read_json(json_cols)
#   age name
# 0 50  Ted
# 1 20  Mak
# 2 33  Nina
# 3 25  Leo

peopleDF = pd.DataFrame(cols)
# age   name
# 0 50  Ted
# 1 20  Mak
# 2 33  Nina
# 3 25  Leo

peopleDF.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 2 columns):
# age     4 non-null int64
# name    4 non-null object
# dtypes: int64(1), object(1)
# memory usage: 136.0+ bytes

peopleDF.columns
#Index([u'age', u'name'], dtype='object')

peopleDF.age
# 0    50
# 1    20
# 2    33
# 3    25
# Name: age, dtype: int64

#or
peopleDF['age']

#describe
peopleDF.describe()
# age
# count 4.000000
# mean  32.000000
# std   13.140269
# min   20.000000
# 25%   23.750000
# 50%   29.000000
# 75%   37.250000
# max   50.000000

#create new column to existing df
peopleDF['city'] = pd.Series(['Sydney','Canberra', 'Melbourne', 'Darwin'])
#  age  name    city
# 0 50  Ted Sydney
# 1 20  Mak Canberra
# 2 33  Nina    Melbourne
# 3 25  Leo Darwin

peopleDF['eat'] = pd.Series(['rice','bread','bread','meat'],index=[3,1,0,6])
#   age name    city    eat
# 0 50  Ted Sydney  bread
# 1 20  Mak Canberra    bread
# 2 33  Nina    Melbourne   NaN
# 3 25  Leo Darwin  rice

#broadcasting
peopleDF['country'] = 'Australia'
#   age name    city    eat country
# 0 50  Ted Sydney  bread   Australia
# 1 20  Mak Canberra    bread   Australia
# 2 33  Nina    Melbourne   NaN Australia
# 3 25  Leo Darwin  rice    Australia

#slice DataFrame
counDF = peopleDF[['name', 'country']]
#   name    country
# 0 Ted Australia
# 1 Mak Australia
# 2 Nina    Australia
# 3 Leo Australia

del counDF['name']
#     country
# 0 Australia
# 1 Australia
# 2 Australia
# 3 Australia

#pop out the age, this mutate the peopleDF
peopleDF.pop('age')
#   name    city    eat country
# 0 Ted Sydney  bread   Australia
# 1 Mak Canberra    bread   Australia
# 2 Nina    Melbourne   NaN Australia
# 3 Leo Darwin  rice    Australia

Selection and Projection

import pandas as pd

cols = { 'name': ['Ted', 'Mak', 'Nina','Leo']
        ,'age' : [50, 20, 33, 25] }
df = pd.DataFrame(cols)


df.name == 'Nina'
# 0    False
# 1    False
# 2     True
# 3    False
# Name: name, dtype: bool

#masking
df[df.name == 'Nina']
# age   name
# 2 33  Nina

df.name.str.contains('e')
# 0     True
# 1    False
# 2    False
# 3     True
# Name: name, dtype: bool

df[(df.name.str.contains('e')) & (df.age < 30)]
#   age name
# 3 25  Leo
# df[(df.name.str.contains('e')) and (df.age < 30)] will not work because python 'and'.

#from numexpr (pip install numexpr if not available. My case it was by default)
df.query('age > 25')
#   age name
# 0 50  Ted
# 2 33  Nina

# and <==> &
df.query('age > 25 and name == "Ted"')
#   age name
# 0 50  Ted

# or <==> |
df.query('age > 25 | name == "Leo"')
# age   name
# 0 50  Ted
# 2 33  Nina
# 3 25  Leo

# how not
~(df.name == 'Nina')
# 0     True
# 1     True
# 2    False
# 3     True
# Name: name, dtype: bool
(df.name != 'Nina') # same as above
df[~(df.name == 'Nina')]
# age   name
# 0 50  Ted
# 1 20  Mak
# 3 25  Leo
df.query('not name == "Nina"') #same as above

#get last two rows
df.tail(2)
# age   name
# 2 33  Nina
# 3 25  Leo

# get first two rows
df.head(2)
# age   name
# 0 50  Ted
# 1 20  Mak
# or
df[:2] #slicing

#position based: get first and last
df.iloc[[0,3]]
# age   name
# 0 50  Ted
# 3 25  Leo

#projection
df[['name', 'age']]
# name  age
# 0 Ted 50
# 1 Mak 20
# 2 Nina    33
# 3 Leo 25

#position based projection
df.iloc[:2, :1] # two records with second column
# age
# 0 50
# 1 20


df.loc[:2, ['name']]
# name
# 0 Ted
# 1 Mak
# 2 Nina

Merge

import pandas as pd

cols_age = { 'name': ['Ted', 'Mak', 'Nina','Leo',"Phips"]
        ,'age' : [50, 20, 33, 25,40] }
df_age = pd.DataFrame(cols_age)

cols_marks = { 'name': ['Ted', 'Mak', 'Nina','Leo','Rino']
        ,'marks' : [78.50, 80.90, 33.89, 25, 67] }

df_marks = pd.DataFrame(cols_marks)

pd.merge(df_age,df_marks, on='name', how='inner')
#   age name    marks
# 0 50  Ted 78.50
# 1 20  Mak 80.90
# 2 33  Nina    33.89
# 3 25  Leo 25.00

pd.merge(df_marks,df_age, on='name')
#   marks   name    age
# 0 78.50   Ted 50
# 1 80.90   Mak 20
# 2 33.89   Nina    33
# 3 25.00   Leo 25

pd.merge(df_age,df_marks, on='name', how='left')
#   age name    marks
# 0 50  Ted 78.50
# 1 20  Mak 80.90
# 2 33  Nina    33.89
# 3 25  Leo 25.00
# 4 40  Phips   NaN

pd.merge(df_age,df_marks, on='name', how='right')
#   age name    marks
# 0 50.0    Ted 78.50
# 1 20.0    Mak 80.90
# 2 33.0    Nina    33.89
# 3 25.0    Leo 25.00
# 4 NaN Rino    67.00

pd.merge(df_age,df_marks, on='name', how='outer')
#   age name    marks
# 0 50.0    Ted 78.50
# 1 20.0    Mak 80.90
# 2 33.0    Nina    33.89
# 3 25.0    Leo 25.00
# 4 40.0    Phips   NaN
# 5 NaN Rino    67.00

#copy df
df_fname = df_marks.copy(deep=True)
df_fname = df_fname.rename(columns={'name':'fname'})

pd.merge(df_fname,df_age, left_on='fname', right_on='name')
#   marks   fname   age name
# 0 78.50   Ted 50  Ted
# 1 80.90   Mak 20  Mak
# 2 33.89   Nina    33  Nina
# 3 25.00   Leo 25  Leo

Let’s do more with the merge operation similar to database join

import numpy as np
import pandas as pd

df_emp = pd.DataFrame({
    'name':['Alex','Clara','Dany','John','Philip','Marko'],
    'age':[45,35,23,39,43,21],
    'comp_id':['001','002','003','004','001','004']
    })

df_comp = pd.DataFrame({
    'comp_id':['001','002','003','004'],
    'state':['ACT','ACT','NSW','NSW'],
    'name':['DHS','IMMG','ABC','Ticketek']
    })

#concat
pd.concat([df_emp,df_comp])
#   age comp_id name       state
# 0 45.0    001 Alex       NaN
# 1 35.0    002 Clara      NaN
# 2 23.0    003 Dany       NaN
# 3 39.0    004 John       NaN
# 4 43.0    001 Philip     NaN
# 5 21.0    004 Marko      NaN
# 0 NaN     001 DHS        ACT
# 1 NaN     002 IMMG       ACT
# 2 NaN     003 ABC        NSW
# 3 NaN     004 Ticketek   NSW

pd.concat([df_emp,df_comp],verify_integrity=True)
#error because index overlapping
#therefore ingnore the index if necessary
pd.concat([df_emp,df_comp],ignore_index=True)

pd.concat([df_emp,df_comp],join='inner')
#   comp_id name
# 0 001 Alex
# 1 002 Clara
# 2 003 Dany
# 3 004 John
# 4 001 Philip
# 5 004 Marko
# 0 001 DHS
# 1 002 IMMG
# 2 003 ABC
# 3 004 Ticketek

pd.concat([df_emp,df_comp], axis=1)
#     age   comp_id name    comp_id name    state
# 0 45  001 Alex    001     DHS             ACT
# 1 35  002 Clara   002     IMMG            ACT
# 2 23  003 Dany    003     ABC             NSW
# 3 39  004 John    004     Ticketek        NSW
# 4 43  001 Philip  NaN     NaN             NaN
# 5 21  004 Marko   NaN     NaN             NaN

pd.concat([df_emp,df_comp], axis=1, join='inner')
#   age comp_id name    comp_id name    state
# 0 45  001      Alex   001     DHS             ACT
# 1 35  002      Clara  002     IMMG            ACT
# 2 23  003      Dany   003     ABC             NSW
# 3 39  004      John   004     Ticketek        NSW

df_emp.merge(df_comp,how='inner', indicator=True, 
    left_on='comp_id', right_on='comp_id', suffixes=('_emp', '_comp'))
#   age comp_id     name_emp    name_comp   state   _merge
# 0 45  001         Alex        DHS         ACT     both
# 1 43  001         Philip      DHS         ACT     both
# 2 35  002         Clara       IMMG        ACT     both
# 3 23  003         Dany        ABC         NSW     both
# 4 39  004         John        Ticketek    NSW     both
# 5 21  004         Marko       Ticketek    NSW     both

df_emp.join(df_comp, lsuffix='_emp', rsuffix='_comp', on='comp_id', how='left')

Operaions to Data Frames

  • Adding, renaming and delete column
  • changes to categorical type
  • axes and sort operations
  • group by operation
  • append and concat of two DFs
  • iterate over columns and rows
import pandas as pd

'''
National Assessment Program – Literacy and Numeracy (NAPLAN) - 2010 data
URL: https://data.gov.au/dataset/education-national-assessment-program-literacy-and-numeracy-nsw
'''
df = pd.read_csv('./education/gender-Table 1.csv'
    , names=['Year','Gender','Reading', 'Numeracy']
    , skiprows=[0], usecols=[0,1,2,3]
    )
#     Year  Gender  Reading Numeracy
# 0 Year 3  Male    94.0    94.5
# 1 Year 3  Female  96.6    95.6
# 2 Year 5  Male    91.3    94.4
# 3 Year 5  Female  94.8    94.9
# 4 Year 7  Male    93.7    94.7
# 5 Year 7  Female  96.4    95.1
# 6 Year 9  Male    88.9    93.4
# 7 Year 9  Female  93.6    92.8

df.Gender.value_counts()
# Male      4
# Female    4
# Name: Gender, dtype: int64

#change to the categorical type
df['Gender'] = df.Gender.astype('category')
df.shape
#rows and cols : (8, 4)

df.axes
# [RangeIndex(start=0, stop=8, step=1),
#  Index([u'Year', u'Gender', u'Reading', u'Numeracy'], dtype='object')]

df.axes[0]
#RangeIndex(start=0, stop=8, step=1)
#or
df.index
df.axes[0] is df.index
#True

df.axes[1]
#Index([u'Year', u'Gender', u'Reading', u'Numeracy'], dtype='object')
df.sort_index(axis=1)

df.index.unique()
df.index.duplicated().any()
df.index.duplicated().all()

df.columns
df.columns.duplicated()

df.describe()
# Reading   Numeracy
# count 8.000000    8.000000
# mean  93.662500   94.425000
# std   2.557866    0.913001
# min   88.900000   92.800000
# 25%   93.025000   94.150000
# 50%   93.850000   94.600000
# 75%   95.200000   94.950000
# max   96.600000   95.600000

df.describe(include='all')
# Year  Gender  Reading Numeracy
# count 8   8   8.000000    8.000000
# unique    4   2   NaN NaN
# top   Year 3  Male    NaN NaN
# freq  2   4   NaN NaN
# mean  NaN NaN 93.662500   94.425000
# std   NaN NaN 2.557866    0.913001
# min   NaN NaN 88.900000   92.800000
# 25%   NaN NaN 93.025000   94.150000
# 50%   NaN NaN 93.850000   94.600000
# 75%   NaN NaN 95.200000   94.950000
# max   NaN NaN 96.600000   95.600000

df.groupby('Gender').sum()
#         Reading   Numeracy
# Gender
# Female    381.4   378.4
# Male  367.9   377.0

%matplotlib inline
ax = df['Reading'].plot(kind='box')

df_au = pd.read_csv('./education/australia-Table 1.csv'
    , names=['Year','Cat','Reading', 'Numeracy']
    , skiprows=[0], usecols=[0,1,2,3]
    )

df_ind = pd.read_csv('./education/indigenous-Table 1.csv'
    , names=['Year','Cat','Reading', 'Numeracy']
    , skiprows=[0], usecols=[0,1,2,3]
    )
df_au.append(df_ind)
#     Year  Cat Reading Numeracy
# 0 Year 3  Australia   93.9    94.3
# 1 Year 3  NSW 95.3    95.0
# 2 Year 5  Australia   91.3    93.7
# ...
# 0 Year 3  Aboriginal  85.5    83.9
# 1 Year 3  non-Aboriginal  95.7    95.6
# 2 Year 5  Aboriginal  77.7    80.9
# ...
# same above
pd.concat([df_au,df_ind])

#reset the index
df_nsw=pd.concat([df_au,df_ind],ignore_index=True)
df_nsw.iloc[1]
# Year        Year 3
# Cat            NSW
# Reading       95.3
# Numeracy        95
# Name: 1, dtype: object

#update existing
df_nsw.iloc[1] = ['Year 3','NSW',99,98]

#to add new
len(df)
#8

#add a record
df.loc[8] = ['Year 3','NSW',99,98]
#     Year  Gender  Reading Numeracy
# 0 Year 3  Male    94.0    94.5
# 1 Year 3  Female  96.6    95.6
# 2 Year 5  Male    91.3    94.4
# 3 Year 5  Female  94.8    94.9
# 4 Year 7  Male    93.7    94.7
# 5 Year 7  Female  96.4    95.1
# 6 Year 9  Male    88.9    93.4
# 7 Year 9  Female  93.6    92.8
# 8 Year 3  NaN 99.0    98.0

#add a new column. You will get an error if the list is in different dimension
df['given'] =[2,3,4,9,7,5,6,2,4]

#create a column with one default value
df['given2'] = 2
#or iterate such as
df.loc[2:5]
df.loc[:,'given3'] = '3'

#calculated column
df['avg']=(df['Reading']+df['Numeracy'])/2

#based on function
def tot(cols):
    read, nume = cols
    return read + nume
#this will create new series which can be added to the df
d = df[['Reading', 'Numeracy']].apply(tot,axis=1)

#filter the df
df[['Year','Reading']]
#but to mutate the df and delete the column
df.pop('given')
#same as above but not return the deleted column as above
del df['given2']

#rename the column
df.rename(columns={'given3': 'given'})

#rename the index

df.sort_index(ascending=False)
df.sort_values('Gender', ascending=False)
#    Year   Gender  Reading Numeracy    given3  avg
# 6 Year 9  Male    88.9    93.4    3   91.15
# 4 Year 7  Male    93.7    94.7    3   94.20
# 2 Year 5  Male    91.3    94.4    3   92.85
# 0 Year 3  Male    94.0    94.5    3   94.25
# 7 Year 9  Female  93.6    92.8    3   93.20
# 5 Year 7  Female  96.4    95.1    3   95.75
# 3 Year 5  Female  94.8    94.9    3   94.85
# 1 Year 3  Female  96.6    95.6    3   96.10
# 8 Year 3  NaN 99.0    98.0    3   98.50

df['given'] = [5,3,4,6,3,4,5,1,2]
df.sort_values(['given', 'Gender'], ascending=[False, False])

#column sort

df.sort_index(axis=1)
#     Gender    Numeracy    Reading Year
# 0 Male    94.5    94.0    Year 3
# 1 Female  95.6    96.6    Year 3
# 2 Male    94.4    91.3    Year 5
# 3 Female  94.9    94.8    Year 5
# 4 Male    94.7    93.7    Year 7
# 5 Female  95.1    96.4    Year 7
# 6 Male    93.4    88.9    Year 9
# 7 Female  92.8    93.6    Year 9

#if you iterate the data frame
list(df)
#['Year', 'Gender', 'Reading', 'Numeracy']
df.iteritems #iterate over column
# for example
for name, col in df.iteritems():
    print (name, col)
    break
# ('Year', 0    Year 3
# 1    Year 3
# 2    Year 5
# 3    Year 5
# 4    Year 7
# 5    Year 7
# 6    Year 9
# 7    Year 9
# Name: Year, dtype: object)

#to iterate over rows
for tup in df.itertuples():
    print(tup)
# Pandas(Index=0, Year='Year 3', Gender='Male', Reading=94.0, Numeracy=94.5)
# Pandas(Index=1, Year='Year 3', Gender='Female', Reading=96.599999999999994, Numeracy=95.599999999999994)
# Pandas(Index=2, Year='Year 5', Gender='Male', Reading=91.299999999999997, Numeracy=94.400000000000006)
# Pandas(Index=3, Year='Year 5', Gender='Female', Reading=94.799999999999997, Numeracy=94.900000000000006)
# Pandas(Index=4, Year='Year 7', Gender='Male', Reading=93.700000000000003, Numeracy=94.700000000000003)
# Pandas(Index=5, Year='Year 7', Gender='Female', Reading=96.400000000000006, Numeracy=95.099999999999994)
# Pandas(Index=6, Year='Year 9', Gender='Male', Reading=88.900000000000006, Numeracy=93.400000000000006)
# Pandas(Index=7, Year='Year 9', Gender='Female', Reading=93.599999999999994, Numeracy=92.799999999999997)

for row in df.iterrows():
    print(row)
# (0, Year        Year 3
# Gender        Male
# Reading         94
# Numeracy      94.5
# Name: 0, dtype: object)
# (1, Year        Year 3
# Gender      Female
# Reading       96.6
# Numeracy      95.6
# Name: 1, dtype: object)
# ...

#set index to other column
import string
df_au['num']= [x for x in string.lowercase[0:8]]
df_au =df_au.set_index('num')
#     Year  Cat Reading Numeracy
# num
# a Year 3  Australia   93.9    94.3
# b Year 3  NSW         95.3    95.0
# c Year 5  Australia   91.3    93.7
# d Year 5  NSW         93.0    94.7
# e Year 7  Australia   94.9    95.1
# f Year 7  NSW         95.0    94.9
# g Year 9  Australia   90.8    93.1
# h Year 9  NSW 91.2    93.1

#set the cell value: this will mutate the DF
df_au.set_value('d','Reading',99.89)
#     Year  Cat Reading Numeracy
# num
# ...
# c Year 5  Australia   91.30   93.7
# d Year 5  NSW         99.89   94.7
# e Year 7  Australia   94.90   95.1
# ...

#set cell using loc
df_au.loc['b','Reading'] = 96.0
#     Year  Cat Reading Numeracy
# num
# a Year 3  Australia   93.90   94.3
# b Year 3  NSW        96.00    95.0
# c Year 5  Australia   91.30   93.7
# ...
df_au.loc['i','Reading'] = 96.0 #be careful not to add to the wrong index
#     Year  Cat          Reading    Numeracy
# num
# ...
# g Year 9  Australia      90.80    93.1
# h Year 9  NSW            91.20    93.1
# Year 5    NaN            NaN  90.00   NaN
# i NaN NaN 96.00   NaN

#set cell  using iloc above cell
df_au.iloc[1,2] = 90.0
#     Year  Cat Reading Numeracy
# num
# a Year 3  Australia   93.90   94.3
# b Year 3  NSW        90.00    95.0
# c Year 5  Australia   91.30   93.7
# ...

#be careful not to add to the wrong place
df_au.iloc[8,2] = 90.0
#     Year  Cat Reading Numeracy
# num
# ...
# g Year 9  Australia   90.80   93.1
# h Year 9  NSW         91.20   93.1
#   Year 5  NaN          NaN    90.00   NaN

Written with StackEdit.

Comments

Popular posts from this blog

How To: GitHub projects in Spring Tool Suite

Spring 3 Part 7: Spring with Databases

Parse the namespace based XML using Python