Python fun
This blog is a reference guide for the analytics topics that I plan to address in upcoming posts.
Python
This section covers only Python.
Sort with Lambda
First, I would like to show how a lambda can be used to sort a word-count dictionary (wc) by key or by value (the name of the fruit and its count).
__author__ = 'ojitha'
wc ={'orange':2, 'mango':1, 'cherry':8, 'apple':5}
'''sort on key'''
print(sorted(wc.items(), key=lambda (word, count): word))
'''sort on the count'''
print(sorted(wc.items(), key=lambda (word, count): count))
The output of the above code is as follows:
[('apple', 5), ('cherry', 8), ('mango', 1), ('orange', 2)]
[('mango', 1), ('orange', 2), ('apple', 5), ('cherry', 8)]
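Note that the tuple-unpacking lambda above is Python 2 only syntax. As a sketch, the same sorts can be written so they also run on Python 3, by indexing the (word, count) pairs instead of unpacking them:
print(sorted(wc.items(), key=lambda pair: pair[0]))  #sort on the key (word)
print(sorted(wc.items(), key=lambda pair: pair[1]))  #sort on the count
The output is identical to the above.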
List Comprehensions
Lists of evens and odds can be created as follows:
listOfEvens = [x for x in range(10) if x %2 == 0]
print (listOfEvens)
listOfOdds = [x for x in range(10) if x %2 != 0]
print (listOfOdds)
A nested list comprehension that creates pairs of numbers is as follows:
pairs = [(x,y) for x in range(5) for y in range(5)]
print(pairs)
This will create the pairs as follows:
[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (4, 0), (4, 1), (4, 2), (4, 3), (4, 4)]
As a more practical example, consider the following CSV dataset:
name,age,salary
Mike,20,3000
Ren,30,4050.41
Tom,25,2500.34
Hanks,40,5000.45
The following program reads the above dataset and finds the names of employees whose salary is above 3000.
import csv
def read_csv(file_name):
    records = []
    with open(file_name, 'r') as f:
        rows = csv.reader(f)
        headers = next(rows)
        for row in rows:
            rec = {}
            for i, val in enumerate(row):
                rec[headers[i]] = val
            records.append(rec)
    return records
dics = read_csv('t1.csv')
len(dics)
#4
r1 = dics[1]
r1['salary']
#'4050.41'
total_salary = sum(float(r['salary']) for r in dics)
#list comprehension
names = [r['name'] for r in dics if float(r['salary']) > 3000]
print ' and '.join(names)
#Ren and Hanks
As with lists, you can write comprehensions for other data structures such as dictionaries, as shown in the sketch below.
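For example, a dictionary comprehension can build a name-to-salary mapping from the records read above (a minimal sketch, reusing the dics list produced by read_csv; the name salary_by_name is chosen here for illustration):
salary_by_name = {r['name']: float(r['salary']) for r in dics}
print(salary_by_name['Ren'])
#output: 4050.41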
The list of records created by the above code can be sorted as follows:
def sorting_key(rec):
    return rec['name']

dics.sort(key=sorting_key)
#or use a lambda
dics.sort(key=lambda x: x['name'])
map function
The map function applies a given function to each element of a sequence (here, pairwise across two sequences):
def multi(l, r):
    return l * r
x = [1,2,3,4]
y = [10,20,30,40]
print(map(multi,x, y))
#output : [10, 40, 90, 160]
filter function
Here, filter applies a predicate function to a sequence and keeps only the elements for which it returns True:
def is_even(x):
    return x % 2 == 0
x = [1,2,3,4,5,6,7,8,14,31,45]
print(filter(is_even, x))
#output [2, 4, 6, 8, 14]
reduce function
The reduce function applies the function cumulatively to the items of a sequence, reducing it to a single value (in Python 3, import it first with from functools import reduce):
def multi(x, y): return x * y
x = [1,2,3,4]
print(reduce(multi, x))
#output: 24
Pythonic way of enumeration
x = ['a','b','c','d']
for i, j in enumerate(x):
    print i, j
zip and unzip
Here is example code to zip and unzip lists:
x = ['a','b','c','d']
y = [1,2,3,4]
z = ['p','q','r','s']
l = zip(x,y,z)
#zip
print l
#unzip
p,q,r = zip(*l)
print p
print q
print r
The output is as follows:
[('a', 1, 'p'), ('b', 2, 'q'), ('c', 3, 'r'), ('d', 4, 's')]
('a', 'b', 'c', 'd')
(1, 2, 3, 4)
('p', 'q', 'r', 's')
JSON
Here is how a dictionary is transferred to and from JSON:
import json
rec = {'name':'Mike', 'age':45}
j = json.dumps(rec)  #serialize the dictionary to a JSON string
r = json.loads(j)  #parse the JSON string back into a dictionary
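A quick check of the round trip (a minimal sketch; the key order inside the JSON string may vary):
print(j)
#a JSON string, e.g. {"age": 45, "name": "Mike"}
print(r == rec)
#True: decoding gives back an equal dictionary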
Pandas
Basics
Reading a CSV file and creating a histogram is very simple in Pandas.
import pandas as pd
df = pd.read_csv('t1.csv')
%matplotlib inline
df['salary'].hist()
Series
The Pandas Series is the most important data structure. Here are its data types:
import pandas as pd
serInt = pd.Series(range(1))
serInt.dtype
#dtype('int64')
serFloat = pd.Series([1.0,2.0])
serFloat.dtype
#dtype('float64')
setStr = pd.Series(['a','b'])
setStr.dtype
#dtype('O')
serdictInt = pd.Series({'one':1, 'two':2})
serdictInt.dtype
#dtype('int64')
serDictFloat = pd.Series({'one':1.0, 'two':2.0})
serDictFloat.dtype
#dtype('float64')
sertDictStr = pd.Series({'one':'I', 'two':'II'})
sertDictStr.dtype
#dtype('O')
sertDictStrKey = pd.Series({'one':'I', 'two':'II'})
sertDictStrKey.dtype
#dtype('O')
serDate = pd.Series([pd.to_datetime('2016-01-01')])
serDate.dtype
#dtype('<M8[ns]')
serCategorical = pd.Series(['a','b'],dtype='category')
serCategorical.dtype
#category
Iterating series
import pandas as pd
s = pd.Series([10,20,30,40])
for num in s:
    print(num)
# 10
# 20
# 30
# 40
1 in s
#True
10 in s
#False: because the in operator checks the index, not the values
10 in s.values
#True or use the following
10 in set(s)
#True
#convert to dictionary
d = dict(s)
#{0: 10, 1: 20, 2: 30, 3: 40}
#directly iterate series
for lab, val in s.iteritems():
    print (lab, val)
# (0, 10)
# (1, 20)
# (2, 30)
# (3, 40)
#same thing with a dictionary
for lab, val in dict(s).iteritems():
    print (lab, val)
# (0, 10)
# (1, 20)
# (2, 30)
# (3, 40)
Broadcasting
import pandas as pd
s= [10,20,30]
ser = pd.Series(s)
ser + 2
# 0 12
# 1 22
# 2 32
# dtype: int64
#s + 2 does not work for a plain Python list (it raises a TypeError)
# but
s * 2
#[10, 20, 30, 10, 20, 30]
#list repetition: the Series below behaves completely differently from the standard Python list
ser * 2
# 0 20
# 1 40
# 2 60
# dtype: int64
#s + ser also works, or equivalently:
ser + s
# 0 20
# 1 40
# 2 60
# dtype: int64
#create second series
s1 = [40,50,60]
ser1 = pd.Series(s1)
ser + ser1
# 0 50
# 1 70
# 2 90
# dtype: int64
ser3 = pd.Series(s1, index=[2,3,4])
# 2 40
# 3 50
# 4 60
# dtype: int64
#only the intersecting labels are added; the rest become NaN
ser + ser3
# 0 NaN
# 1 NaN
# 2 70.0
# 3 NaN
# 4 NaN
# dtype: float64
def disp(val):
    return val
ser.apply(disp)
# 0 10
# 1 20
# 2 30
# dtype: int64
def add_2(val):
    return val + 2
ser.apply(add_2)
# 0 12
# 1 22
# 2 32
# dtype: int64
#to convert to float
ser.apply(float)
# 0 10.0
# 1 20.0
# 2 30.0
# dtype: float64
#but for type conversion it is better to use astype, e.g. to string:
ser.astype(str)
# 0 10
# 1 20
# 2 30
# dtype: object
Series position and label
import pandas as pd
s= [10,20,30]
ser = pd.Series(s)
s[0]
ser[0]
#10
#ser[-1] does not work; it raises a KeyError because there is no label -1
s[-1]
#30
#s[4] raises IndexError: list index out of range
#ser[4] raises a KeyError
#s.loc[0] does not work (a list has no .loc), but:
ser.loc[0] #value based on the label
#10
#position based
#ser.loc[-1] does not work, but:
ser.iloc[-1]
#30
#ser.iloc[4] is IndexError
#but this is different when the index labels are strings;
#[] looks up labels first and falls back to position for integer keys. This is the proof:
#string index
a = [1,2,3,4,5]
ser2 = pd.Series(a, name='ser2', index=['a','b','c','d','e'])
a[-1]
ser2[-1]
ser2.iloc[-1] #position based
#5
# ser2.loc[-1] raises a KeyError (no such label)
# ser2.loc[0] also raises a KeyError. But:
ser2.loc['a'] #label based
#1
#label lookup examples with a mixed (string and integer) index
ser3 = pd.Series(a, name='ser3', index = ['a','b',0,1,2])
ser3.iloc[0] #position
ser3['a']
#1
ser3.iloc[2] #position
ser3[0]
#3
Update Series
import pandas as pd
s= [10,20,30]
ser = pd.Series(s, index=['a','b','c'])
# a 10
# b 20
# c 30
# dtype: int64
#update by label
ser['c'] = 40
# a 10
# b 20
# c 40
# dtype: int64
ser.iloc[0] = 5
# a 5
# b 20
# c 30
# dtype: int64
#add series
ser.append(pd.Series([100])) #returns a new Series; the existing one is not mutated
# a 5
# b 20
# c 30
# 0 100
# dtype: int64
ser.set_value('a',-10) #mutate the series
# a -10
# b 20
# c 30
# dtype: int64
#mutates by adding a new label/value pair
ser.set_value('d',50)
# a -10
# b 20
# c 30
# d 50
# dtype: int64
Delete and filter Series
import pandas as pd
d1 = {'one': 1, 'two':2}
#{'one': 1, 'two': 2}
del d1['two']
#{'one': 1}
s= [10,20,30]
ser = pd.Series(s, index=['a','b','c'])
# a 10
# b 20
# c 30
# dtype: int64
del ser['b'] #mutated
# a 10
# c 30
# dtype: int64
#masking
mask = ser > 20
# a False
# c True
# dtype: bool
ser[mask]
# c 30
# dtype: int64
ser.index
#Index([u'a', u'c'], dtype='object')
#masking the index
mask = ser.index == 'a'
#array([ True, False], dtype=bool)
ser[mask]
# a 10
# dtype: int64
Summaries and Duplicates
import pandas as pd
s= [10,20,30,40,50]
ser = pd.Series(s, index=['a','b','c','d','e'])
ser.describe()
# count 5.000000
# mean 30.000000
# std 15.811388
# min 10.000000
# 25% 20.000000
# 50% 30.000000
# 75% 40.000000
# max 50.000000
# dtype: float64
ser.value_counts()
# 30 1
# 20 1
# 10 1
# 50 1
# 40 1
# dtype: int64
#categorical data
cat = ['mango','apple','banana','grape','date','apple','pineapple','apple','banana']
catser = pd.Series(cat, dtype='category')
catser.describe()
# count 9
# unique 6
# top apple
# freq 3
# dtype: object
catser.value_counts()
# apple 3
# banana 2
# pineapple 1
# mango 1
# grape 1
# date 1
# dtype: int64
ser.duplicated()
# a False
# b False
# c False
# d False
# e False
# dtype: bool
ser1 = ser.append(pd.Series([10],index=['f']))
# a 10
# b 20
# c 30
# d 40
# e 50
# f 10
# dtype: int64
ser1.duplicated() #the last value duplicates the first one
# a False
# b False
# c False
# d False
# e False
# f True
# dtype: bool
ser.duplicated().any()
#False
ser1.duplicated().any()
#True
ser1.duplicated().all()
#False
ser1.duplicated(keep='last')
# a True
# b False
# c False
# d False
# e False
# f False
# dtype: bool
ser1.duplicated(keep=False)
# a True
# b False
# c False
# d False
# e False
# f True
# dtype: bool
#filter the duplicated values
mask = ser1.duplicated(keep=False)
ser1[mask]
# a 10
# f 10
# dtype: int64
#values that are not duplicated
ser1[~mask]
# b 20
# c 30
# d 40
# e 50
# dtype: int64
ser1.drop_duplicates(keep=False) #returns a new Series; ser1 is not mutated
# b 20
# c 30
# d 40
# e 50
# dtype: int64
#append an entry with a duplicated index label
ser_key_dups = ser.append(pd.Series([100], index=['a']))
# a 10
# b 20
# c 30
# d 40
# e 50
# a 100
# dtype: int64
ser_key_dups['a']
# a 10
# a 100
# dtype: int64
Work with NaN
import pandas as pd
s= [10,20,30,40,None]
ser = pd.Series(s, index=['a','b','c','d','e'])
# a 10.0
# b 20.0
# c 30.0
# d 40.0
# e NaN
# dtype: float64
s1= [10,20,30,40,50]
ser1 = pd.Series(s1, index=['a','b','c','d',None])
# a 10
# b 20
# c 30
# d 40
# NaN 50
# dtype: int64
ser2 = ser1.append( pd.Series([None], index=['f']))
# a       10
# b       20
# c       30
# d       40
# NaN     50
# f     None
# dtype: object
len(ser2)
#6
ser2.count()
#5
ser2.isnull()
# a False
# b False
# c False
# d False
# NaN False
# f True
# dtype: bool
ser2.isnull().any()
#True
ser2.isnull().all()
#False
mask = ser2.isnull()
# a False
# b False
# c False
# d False
# NaN False
# f True
# dtype: bool
ser2[mask]
# f None
# dtype: object
ser2.dropna()
# a 10
# b 20
# c 30
# d 40
# NaN 50
# dtype: object
ser2.fillna(method='ffill')
# a 10
# b 20
# c 30
# d 40
# NaN    50
# f      50   (forward-filled from the previous value)
# dtype: object
import numpy as np
pd.Series([np.nan])
# 0 NaN
# dtype: float64
Series -> CSV -> DataFrame
It is interesting to save a Series to a CSV file and read it back. You can also create a DataFrame from the same CSV file.
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
s= [10,20,5,40,50]
ser = pd.Series(s, name='dummy' ,index=['a','b','c','d','e'])
ser.name
ax = ser.plot()
ax.set_xlabel("x label")
ax.set_ylabel("y label")
#save series to file
ser.to_csv('test.csv', header=True, index_label='no#')
#read from a file
serff= pd.Series.from_csv('test.csv')
#read with the headers
serffh = pd.Series.from_csv('test.csv',header=0)
# no#
# a 10
# b 20
# c 5
# d 40
# e 50
# Name: dummy, dtype: int64
#read as data frame (df)
df = pd.read_csv('test.csv')
# no# dummy
# 0 a 10
# 1 b 20
# 2 c 5
# 3 d 40
# 4 e 50
ser_from_df = df['dummy']
# 0 10
# 1 20
# 2 5
# 3 40
# 4 50
# Name: dummy, dtype: int64
Appendix
Mortgage payment equation
import math

principal = 400000
rate = 0.0535
monthly_rate = rate/12
payment_period = 30  # years
no_of_installments = payment_period * 12
r = math.pow((monthly_rate + 1), no_of_installments)
payment = (monthly_rate * r)/(r - 1) * principal
print(payment)
The monthly payment is given by payment = P * r * (1 + r)^n / ((1 + r)^n - 1), where P is the principal, r is the monthly rate, and n is the number of instalments.
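As a rough check with the values above: r = 0.0535/12 ≈ 0.004458, n = 360, and (1 + r)^n ≈ 4.96, so the payment is approximately 400000 × 0.004458 × 4.96 / 3.96 ≈ 2,234 per month.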