Introduction

This is Panda exercises:

Data Frames

In this section, I explore how to create data frames from different ways:

list
dictionary
Json
csv file

In addition to that basic Data Frame (DF) manipulations:

import pandas as pd

cols = { 'name': ['Ted', 'Mak', 'Nina','Leo']
        ,'age' : [50, 20, 33, 25] }

l_students= [{'name':'Tailor', 'grade':'10','math':60}
            ,{'name':'Lora', 'grade':'09','math':80}
            ,{'name':'Joe', 'grade':'11','math':56.90}
            ,{'name':'Tailor', 'grade':'11','math':68.98} ]

studemtDF = pd.DataFrame(l_students)

# read from the json
import json
json_students = json.dumps(l_students)
# [
#   {
#     "grade": "10",
#     "name": "Tailor",
#     "math": 60
#   },
#   {
#     "grade": "09",
#     "name": "Lora",
#     "math": 80
#   },
#   {
#     "grade": "11",
#     "name": "Joe",
#     "math": 56.9
#   },
#   {
#     "grade": "11",
#     "name": "Tailor",
#     "math": 68.98
#   }
# ]
json_std_df = pd.read_json(json_students)
#   grade   math    name
# 0 10  60.00   Tailor
# 1 9   80.00   Lora
# 2 11  56.90   Joe
# 3 11  68.98   Tailor

json_cols = json.dumps(cols)
# {
#   "age": [
#     50,
#     20,
#     33,
#     25
#   ],
#   "name": [
#     "Ted",
#     "Mak",
#     "Nina",
#     "Leo"
#   ]
# }
json_cols_df = pd.read_json(json_cols)
#   age name
# 0 50  Ted
# 1 20  Mak
# 2 33  Nina
# 3 25  Leo

peopleDF = pd.DataFrame(cols)
# age   name
# 0 50  Ted
# 1 20  Mak
# 2 33  Nina
# 3 25  Leo

peopleDF.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 4 entries, 0 to 3
# Data columns (total 2 columns):
# age     4 non-null int64
# name    4 non-null object
# dtypes: int64(1), object(1)
# memory usage: 136.0+ bytes

peopleDF.columns
#Index([u'age', u'name'], dtype='object')

peopleDF.age
# 0    50
# 1    20
# 2    33
# 3    25
# Name: age, dtype: int64

#or
peopleDF['age']

#describe
peopleDF.describe()
# age
# count 4.000000
# mean  32.000000
# std   13.140269
# min   20.000000
# 25%   23.750000
# 50%   29.000000
# 75%   37.250000
# max   50.000000

#create new column to existing df
peopleDF['city'] = pd.Series(['Sydney','Canberra', 'Melbourne', 'Darwin'])
#  age  name    city
# 0 50  Ted Sydney
# 1 20  Mak Canberra
# 2 33  Nina    Melbourne
# 3 25  Leo Darwin

peopleDF['eat'] = pd.Series(['rice','bread','bread','meat'],index=[3,1,0,6])
#   age name    city    eat
# 0 50  Ted Sydney  bread
# 1 20  Mak Canberra    bread
# 2 33  Nina    Melbourne   NaN
# 3 25  Leo Darwin  rice

#broadcasting
peopleDF['country'] = 'Australia'
#   age name    city    eat country
# 0 50  Ted Sydney  bread   Australia
# 1 20  Mak Canberra    bread   Australia
# 2 33  Nina    Melbourne   NaN Australia
# 3 25  Leo Darwin  rice    Australia

#slice DataFrame
counDF = peopleDF[['name', 'country']]
#   name    country
# 0 Ted Australia
# 1 Mak Australia
# 2 Nina    Australia
# 3 Leo Australia

del counDF['name']
#     country
# 0 Australia
# 1 Australia
# 2 Australia
# 3 Australia

#pop out the age, this mutate the peopleDF
peopleDF.pop('age')
#   name    city    eat country
# 0 Ted Sydney  bread   Australia
# 1 Mak Canberra    bread   Australia
# 2 Nina    Melbourne   NaN Australia
# 3 Leo Darwin  rice    Australia

Selection and Projection

import pandas as pd

cols = { 'name': ['Ted', 'Mak', 'Nina','Leo']
        ,'age' : [50, 20, 33, 25] }
df = pd.DataFrame(cols)


df.name == 'Nina'
# 0    False
# 1    False
# 2     True
# 3    False
# Name: name, dtype: bool

#masking
df[df.name == 'Nina']
# age   name
# 2 33  Nina

df.name.str.contains('e')
# 0     True
# 1    False
# 2    False
# 3     True
# Name: name, dtype: bool

df[(df.name.str.contains('e')) & (df.age < 30)]
#   age name
# 3 25  Leo
# df[(df.name.str.contains('e')) and (df.age < 30)] will not work because python 'and'.

#from numexpr (pip install numexpr if not available. My case it was by default)
df.query('age > 25')
#   age name
# 0 50  Ted
# 2 33  Nina

# and <==> &
df.query('age > 25 and name == "Ted"')
#   age name
# 0 50  Ted

# or <==> |
df.query('age > 25 | name == "Leo"')
# age   name
# 0 50  Ted
# 2 33  Nina
# 3 25  Leo

# how not
~(df.name == 'Nina')
# 0     True
# 1     True
# 2    False
# 3     True
# Name: name, dtype: bool
(df.name != 'Nina') # same as above
df[~(df.name == 'Nina')]
# age   name
# 0 50  Ted
# 1 20  Mak
# 3 25  Leo
df.query('not name == "Nina"') #same as above

#get last two rows
df.tail(2)
# age   name
# 2 33  Nina
# 3 25  Leo

# get first two rows
df.head(2)
# age   name
# 0 50  Ted
# 1 20  Mak
# or
df[:2] #slicing

#position based: get first and last
df.iloc[[0,3]]
# age   name
# 0 50  Ted
# 3 25  Leo

#projection
df[['name', 'age']]
# name  age
# 0 Ted 50
# 1 Mak 20
# 2 Nina    33
# 3 Leo 25

#position based projection
df.iloc[:2, :1] # two records with second column
# age
# 0 50
# 1 20


df.loc[:2, ['name']]
# name
# 0 Ted
# 1 Mak
# 2 Nina

Merge

import pandas as pd

cols_age = { 'name': ['Ted', 'Mak', 'Nina','Leo',"Phips"]
        ,'age' : [50, 20, 33, 25,40] }
df_age = pd.DataFrame(cols_age)

cols_marks = { 'name': ['Ted', 'Mak', 'Nina','Leo','Rino']
        ,'marks' : [78.50, 80.90, 33.89, 25, 67] }

df_marks = pd.DataFrame(cols_marks)

pd.merge(df_age,df_marks, on='name', how='inner')
#   age name    marks
# 0 50  Ted 78.50
# 1 20  Mak 80.90
# 2 33  Nina    33.89
# 3 25  Leo 25.00

pd.merge(df_marks,df_age, on='name')
#   marks   name    age
# 0 78.50   Ted 50
# 1 80.90   Mak 20
# 2 33.89   Nina    33
# 3 25.00   Leo 25

pd.merge(df_age,df_marks, on='name', how='left')
#   age name    marks
# 0 50  Ted 78.50
# 1 20  Mak 80.90
# 2 33  Nina    33.89
# 3 25  Leo 25.00
# 4 40  Phips   NaN

pd.merge(df_age,df_marks, on='name', how='right')
#   age name    marks
# 0 50.0    Ted 78.50
# 1 20.0    Mak 80.90
# 2 33.0    Nina    33.89
# 3 25.0    Leo 25.00
# 4 NaN Rino    67.00

pd.merge(df_age,df_marks, on='name', how='outer')
#   age name    marks
# 0 50.0    Ted 78.50
# 1 20.0    Mak 80.90
# 2 33.0    Nina    33.89
# 3 25.0    Leo 25.00
# 4 40.0    Phips   NaN
# 5 NaN Rino    67.00

#copy df
df_fname = df_marks.copy(deep=True)
df_fname = df_fname.rename(columns={'name':'fname'})

pd.merge(df_fname,df_age, left_on='fname', right_on='name')
#   marks   fname   age name
# 0 78.50   Ted 50  Ted
# 1 80.90   Mak 20  Mak
# 2 33.89   Nina    33  Nina
# 3 25.00   Leo 25  Leo

Let’s do more with the merge operation similar to database join

import numpy as np
import pandas as pd

df_emp = pd.DataFrame({
    'name':['Alex','Clara','Dany','John','Philip','Marko'],
    'age':[45,35,23,39,43,21],
    'comp_id':['001','002','003','004','001','004']
    })

df_comp = pd.DataFrame({
    'comp_id':['001','002','003','004'],
    'state':['ACT','ACT','NSW','NSW'],
    'name':['DHS','IMMG','ABC','Ticketek']
    })

#concat
pd.concat([df_emp,df_comp])
#   age comp_id name       state
# 0 45.0    001 Alex       NaN
# 1 35.0    002 Clara      NaN
# 2 23.0    003 Dany       NaN
# 3 39.0    004 John       NaN
# 4 43.0    001 Philip     NaN
# 5 21.0    004 Marko      NaN
# 0 NaN     001 DHS        ACT
# 1 NaN     002 IMMG       ACT
# 2 NaN     003 ABC        NSW
# 3 NaN     004 Ticketek   NSW

pd.concat([df_emp,df_comp],verify_integrity=True)
#error because index overlapping
#therefore ingnore the index if necessary
pd.concat([df_emp,df_comp],ignore_index=True)

pd.concat([df_emp,df_comp],join='inner')
#   comp_id name
# 0 001 Alex
# 1 002 Clara
# 2 003 Dany
# 3 004 John
# 4 001 Philip
# 5 004 Marko
# 0 001 DHS
# 1 002 IMMG
# 2 003 ABC
# 3 004 Ticketek

pd.concat([df_emp,df_comp], axis=1)
#     age   comp_id name    comp_id name    state
# 0 45  001 Alex    001     DHS             ACT
# 1 35  002 Clara   002     IMMG            ACT
# 2 23  003 Dany    003     ABC             NSW
# 3 39  004 John    004     Ticketek        NSW
# 4 43  001 Philip  NaN     NaN             NaN
# 5 21  004 Marko   NaN     NaN             NaN

pd.concat([df_emp,df_comp], axis=1, join='inner')
#   age comp_id name    comp_id name    state
# 0 45  001      Alex   001     DHS             ACT
# 1 35  002      Clara  002     IMMG            ACT
# 2 23  003      Dany   003     ABC             NSW
# 3 39  004      John   004     Ticketek        NSW

df_emp.merge(df_comp,how='inner', indicator=True, 
    left_on='comp_id', right_on='comp_id', suffixes=('_emp', '_comp'))
#   age comp_id     name_emp    name_comp   state   _merge
# 0 45  001         Alex        DHS         ACT     both
# 1 43  001         Philip      DHS         ACT     both
# 2 35  002         Clara       IMMG        ACT     both
# 3 23  003         Dany        ABC         NSW     both
# 4 39  004         John        Ticketek    NSW     both
# 5 21  004         Marko       Ticketek    NSW     both

df_emp.join(df_comp, lsuffix='_emp', rsuffix='_comp', on='comp_id', how='left')

Operaions to Data Frames

Adding, renaming and delete column
changes to categorical type
axes and sort operations
group by operation
append and concat of two DFs
iterate over columns and rows

import pandas as pd

'''
National Assessment Program – Literacy and Numeracy (NAPLAN) - 2010 data
URL: https://data.gov.au/dataset/education-national-assessment-program-literacy-and-numeracy-nsw
'''
df = pd.read_csv('./education/gender-Table 1.csv'
    , names=['Year','Gender','Reading', 'Numeracy']
    , skiprows=[0], usecols=[0,1,2,3]
    )
#     Year  Gender  Reading Numeracy
# 0 Year 3  Male    94.0    94.5
# 1 Year 3  Female  96.6    95.6
# 2 Year 5  Male    91.3    94.4
# 3 Year 5  Female  94.8    94.9
# 4 Year 7  Male    93.7    94.7
# 5 Year 7  Female  96.4    95.1
# 6 Year 9  Male    88.9    93.4
# 7 Year 9  Female  93.6    92.8

df.Gender.value_counts()
# Male      4
# Female    4
# Name: Gender, dtype: int64

#change to the categorical type
df['Gender'] = df.Gender.astype('category')
df.shape
#rows and cols : (8, 4)

df.axes
# [RangeIndex(start=0, stop=8, step=1),
#  Index([u'Year', u'Gender', u'Reading', u'Numeracy'], dtype='object')]

df.axes[0]
#RangeIndex(start=0, stop=8, step=1)
#or
df.index
df.axes[0] is df.index
#True

df.axes[1]
#Index([u'Year', u'Gender', u'Reading', u'Numeracy'], dtype='object')
df.sort_index(axis=1)

df.index.unique()
df.index.duplicated().any()
df.index.duplicated().all()

df.columns
df.columns.duplicated()

df.describe()
# Reading   Numeracy
# count 8.000000    8.000000
# mean  93.662500   94.425000
# std   2.557866    0.913001
# min   88.900000   92.800000
# 25%   93.025000   94.150000
# 50%   93.850000   94.600000
# 75%   95.200000   94.950000
# max   96.600000   95.600000

df.describe(include='all')
# Year  Gender  Reading Numeracy
# count 8   8   8.000000    8.000000
# unique    4   2   NaN NaN
# top   Year 3  Male    NaN NaN
# freq  2   4   NaN NaN
# mean  NaN NaN 93.662500   94.425000
# std   NaN NaN 2.557866    0.913001
# min   NaN NaN 88.900000   92.800000
# 25%   NaN NaN 93.025000   94.150000
# 50%   NaN NaN 93.850000   94.600000
# 75%   NaN NaN 95.200000   94.950000
# max   NaN NaN 96.600000   95.600000

df.groupby('Gender').sum()
#         Reading   Numeracy
# Gender
# Female    381.4   378.4
# Male  367.9   377.0

%matplotlib inline
ax = df['Reading'].plot(kind='box')

df_au = pd.read_csv('./education/australia-Table 1.csv'
    , names=['Year','Cat','Reading', 'Numeracy']
    , skiprows=[0], usecols=[0,1,2,3]
    )

df_ind = pd.read_csv('./education/indigenous-Table 1.csv'
    , names=['Year','Cat','Reading', 'Numeracy']
    , skiprows=[0], usecols=[0,1,2,3]
    )
df_au.append(df_ind)
#     Year  Cat Reading Numeracy
# 0 Year 3  Australia   93.9    94.3
# 1 Year 3  NSW 95.3    95.0
# 2 Year 5  Australia   91.3    93.7
# ...
# 0 Year 3  Aboriginal  85.5    83.9
# 1 Year 3  non-Aboriginal  95.7    95.6
# 2 Year 5  Aboriginal  77.7    80.9
# ...
# same above
pd.concat([df_au,df_ind])

#reset the index
df_nsw=pd.concat([df_au,df_ind],ignore_index=True)
df_nsw.iloc[1]
# Year        Year 3
# Cat            NSW
# Reading       95.3
# Numeracy        95
# Name: 1, dtype: object

#update existing
df_nsw.iloc[1] = ['Year 3','NSW',99,98]

#to add new
len(df)
#8

#add a record
df.loc[8] = ['Year 3','NSW',99,98]
#     Year  Gender  Reading Numeracy
# 0 Year 3  Male    94.0    94.5
# 1 Year 3  Female  96.6    95.6
# 2 Year 5  Male    91.3    94.4
# 3 Year 5  Female  94.8    94.9
# 4 Year 7  Male    93.7    94.7
# 5 Year 7  Female  96.4    95.1
# 6 Year 9  Male    88.9    93.4
# 7 Year 9  Female  93.6    92.8
# 8 Year 3  NaN 99.0    98.0

#add a new column. You will get an error if the list is in different dimension
df['given'] =[2,3,4,9,7,5,6,2,4]

#create a column with one default value
df['given2'] = 2
#or iterate such as
df.loc[2:5]
df.loc[:,'given3'] = '3'

#calculated column
df['avg']=(df['Reading']+df['Numeracy'])/2

#based on function
def tot(cols):
    read, nume = cols
    return read + nume
#this will create new series which can be added to the df
d = df[['Reading', 'Numeracy']].apply(tot,axis=1)

#filter the df
df[['Year','Reading']]
#but to mutate the df and delete the column
df.pop('given')
#same as above but not return the deleted column as above
del df['given2']

#rename the column
df.rename(columns={'given3': 'given'})

#rename the index

df.sort_index(ascending=False)
df.sort_values('Gender', ascending=False)
#    Year   Gender  Reading Numeracy    given3  avg
# 6 Year 9  Male    88.9    93.4    3   91.15
# 4 Year 7  Male    93.7    94.7    3   94.20
# 2 Year 5  Male    91.3    94.4    3   92.85
# 0 Year 3  Male    94.0    94.5    3   94.25
# 7 Year 9  Female  93.6    92.8    3   93.20
# 5 Year 7  Female  96.4    95.1    3   95.75
# 3 Year 5  Female  94.8    94.9    3   94.85
# 1 Year 3  Female  96.6    95.6    3   96.10
# 8 Year 3  NaN 99.0    98.0    3   98.50

df['given'] = [5,3,4,6,3,4,5,1,2]
df.sort_values(['given', 'Gender'], ascending=[False, False])

#column sort

df.sort_index(axis=1)
#     Gender    Numeracy    Reading Year
# 0 Male    94.5    94.0    Year 3
# 1 Female  95.6    96.6    Year 3
# 2 Male    94.4    91.3    Year 5
# 3 Female  94.9    94.8    Year 5
# 4 Male    94.7    93.7    Year 7
# 5 Female  95.1    96.4    Year 7
# 6 Male    93.4    88.9    Year 9
# 7 Female  92.8    93.6    Year 9

#if you iterate the data frame
list(df)
#['Year', 'Gender', 'Reading', 'Numeracy']
df.iteritems #iterate over column
# for example
for name, col in df.iteritems():
    print (name, col)
    break
# ('Year', 0    Year 3
# 1    Year 3
# 2    Year 5
# 3    Year 5
# 4    Year 7
# 5    Year 7
# 6    Year 9
# 7    Year 9
# Name: Year, dtype: object)

#to iterate over rows
for tup in df.itertuples():
    print(tup)
# Pandas(Index=0, Year='Year 3', Gender='Male', Reading=94.0, Numeracy=94.5)
# Pandas(Index=1, Year='Year 3', Gender='Female', Reading=96.599999999999994, Numeracy=95.599999999999994)
# Pandas(Index=2, Year='Year 5', Gender='Male', Reading=91.299999999999997, Numeracy=94.400000000000006)
# Pandas(Index=3, Year='Year 5', Gender='Female', Reading=94.799999999999997, Numeracy=94.900000000000006)
# Pandas(Index=4, Year='Year 7', Gender='Male', Reading=93.700000000000003, Numeracy=94.700000000000003)
# Pandas(Index=5, Year='Year 7', Gender='Female', Reading=96.400000000000006, Numeracy=95.099999999999994)
# Pandas(Index=6, Year='Year 9', Gender='Male', Reading=88.900000000000006, Numeracy=93.400000000000006)
# Pandas(Index=7, Year='Year 9', Gender='Female', Reading=93.599999999999994, Numeracy=92.799999999999997)

for row in df.iterrows():
    print(row)
# (0, Year        Year 3
# Gender        Male
# Reading         94
# Numeracy      94.5
# Name: 0, dtype: object)
# (1, Year        Year 3
# Gender      Female
# Reading       96.6
# Numeracy      95.6
# Name: 1, dtype: object)
# ...

#set index to other column
import string
df_au['num']= [x for x in string.lowercase[0:8]]
df_au =df_au.set_index('num')
#     Year  Cat Reading Numeracy
# num
# a Year 3  Australia   93.9    94.3
# b Year 3  NSW         95.3    95.0
# c Year 5  Australia   91.3    93.7
# d Year 5  NSW         93.0    94.7
# e Year 7  Australia   94.9    95.1
# f Year 7  NSW         95.0    94.9
# g Year 9  Australia   90.8    93.1
# h Year 9  NSW 91.2    93.1

#set the cell value: this will mutate the DF
df_au.set_value('d','Reading',99.89)
#     Year  Cat Reading Numeracy
# num
# ...
# c Year 5  Australia   91.30   93.7
# d Year 5  NSW         99.89   94.7
# e Year 7  Australia   94.90   95.1
# ...

#set cell using loc
df_au.loc['b','Reading'] = 96.0
#     Year  Cat Reading Numeracy
# num
# a Year 3  Australia   93.90   94.3
# b Year 3  NSW        96.00    95.0
# c Year 5  Australia   91.30   93.7
# ...
df_au.loc['i','Reading'] = 96.0 #be careful not to add to the wrong index
#     Year  Cat          Reading    Numeracy
# num
# ...
# g Year 9  Australia      90.80    93.1
# h Year 9  NSW            91.20    93.1
# Year 5    NaN            NaN  90.00   NaN
# i NaN NaN 96.00   NaN

#set cell  using iloc above cell
df_au.iloc[1,2] = 90.0
#     Year  Cat Reading Numeracy
# num
# a Year 3  Australia   93.90   94.3
# b Year 3  NSW        90.00    95.0
# c Year 5  Australia   91.30   93.7
# ...

#be careful not to add to the wrong place
df_au.iloc[8,2] = 90.0
#     Year  Cat Reading Numeracy
# num
# ...
# g Year 9  Australia   90.80   93.1
# h Year 9  NSW         91.20   93.1
#   Year 5  NaN          NaN    90.00   NaN

Written with StackEdit.

Search This Blog

Ojitha Kumanayaka

Pandas Land

Introduction

Data Frames

Selection and Projection

Merge

Operaions to Data Frames

Comments

Post a Comment

Popular posts from this blog

How To: GitHub projects in Spring Tool Suite

Spring 3 Part 7: Spring with Databases

Parse the namespace based XML using Python