python fun

This post is a reference guide for the analytics topics that I plan to address in upcoming posts.

Python

This section covers only Python.

Sort with Lambda

First, I would like to show how a lambda can be used to sort the items of a word-count dictionary (wc) by key or by value (the name of the fruit or its count).

__author__ = 'ojitha'

# Word-count dictionary: fruit name -> number of occurrences.
wc = {'orange': 2, 'mango': 1, 'cherry': 8, 'apple': 5}

# Sort on the key (the word). Each item is a (word, count) tuple, so index 0
# is the word. Tuple-unpacking lambda parameters were removed in Python 3
# (PEP 3113); indexing the item works on both Python 2 and Python 3.
sorted_by_word = sorted(wc.items(), key=lambda item: item[0])
print(sorted_by_word)

# Sort on the count, which is index 1 of each item.
sorted_by_count = sorted(wc.items(), key=lambda item: item[1])
print(sorted_by_count)

The output of the above code is as follows

[('apple', 5), ('cherry', 8), ('mango', 1), ('orange', 2)]
[('mango', 1), ('orange', 2), ('apple', 5), ('cherry', 8)]

List Comprehensions

Lists of even and odd numbers can be created as follows:

# Keep the numbers below 10 whose remainder modulo 2 is zero (the evens).
listOfEvens = [x for x in range(10) if not x % 2]
print(listOfEvens)
# Keep the numbers below 10 whose remainder modulo 2 is non-zero (the odds).
listOfOdds = [x for x in range(10) if x % 2]
print(listOfOdds)

A list comprehension can also nest loops, for example to build pairs:

# Nested comprehension: the first `for` is the outer loop and the second the
# inner one, so every (x, y) combination with x and y in 0..4 is produced,
# with y changing fastest.
pairs = [(x,y) for x in range(5) for y in range(5)]
print(pairs)

This will create pairs as follows

[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (4, 0), (4, 1), (4, 2), (4, 3), (4, 4)]

For example, consider the following csv dataset:

name,age,salary
Mike,20,3000
Ren,30,4050.41
Tom,25,2500.34
Hanks,40,5000.45

The following program reads the above data set to find the names of the employees whose salary is above 3000.

import csv
def read_csv(file_name):
    """Read a CSV file with a header row into a list of dictionaries.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, mapping each header name to the
        row's value for that column. Values stay strings, exactly as
        ``csv.reader`` yields them. An empty file yields an empty list.
    """
    with open(file_name, 'r') as f:
        rows = csv.reader(f)
        # The default keeps StopIteration from escaping when the file has
        # no header line at all.
        headers = next(rows, None)
        if headers is None:
            return []
        # zip pairs each value with its header and stops at the shorter of
        # the two, so a row with extra cells no longer raises IndexError and
        # a short row simply yields fewer keys.
        return [dict(zip(headers, row)) for row in rows]

# Load every row of the sample data set as a dict keyed by column name.
dics = read_csv('t1.csv')
len(dics)          # number of records
r1 = dics[1]       # the second record
r1['salary']       # its salary, still a string at this point
# The csv values are strings, so convert each one before summing.
total_salary = sum(float(r['salary']) for r in dics)

# List comprehension: names of the employees whose salary is above 3000.
names = [r['name'] for r in dics if float(r['salary']) > 3000]
# The parenthesised form runs on both Python 2 and Python 3.
print(' and '.join(names))

As with lists, you can use comprehensions for other data structures such as dictionaries.

The list of dictionaries from the above code can be sorted as follows:

def sorting_key(rec):
    """Return the value stored under ``'name'`` in ``rec``, for use as a sort key."""
    name = rec['name']
    return name

# Sort the records in place, ordered by the value the key function returns.
dics.sort(key=sorting_key)

# or use a lambda that does the same lookup inline
dics.sort(key = lambda x: x['name'])

map function

The map function applies a function to the elements of one or more sequences:

def multi(l, r):
    """Return the product of ``l`` and ``r``."""
    return l * r


x = [1, 2, 3, 4]
y = [10, 20, 30, 40]

# map pairs up the elements of x and y and calls multi on each pair. On
# Python 3 map returns a lazy iterator, so wrap it in list() to get (and
# print) the list shown below.
products = list(map(multi, x, y))
print(products)
#output : [10, 40, 90, 160]

filter function

The filter function keeps the elements of a sequence for which a predicate returns true:

def is_even(x):
    """Return True when ``x`` leaves no remainder on division by 2."""
    return x % 2 == 0


x = [1, 2, 3, 4, 5, 6, 7, 8, 14, 31, 45]

# filter keeps the elements for which is_even returns True. On Python 3 it
# returns a lazy iterator, so wrap it in list() to get the list shown below.
evens = list(filter(is_even, x))
print(evens)
#output [2, 4, 6, 8, 14]

reduce function

# reduce is not a builtin on Python 3; functools.reduce is available on
# Python 2.6+ as well as Python 3.
from functools import reduce


def multi(x, y):
    """Return the product of ``x`` and ``y``."""
    return x * y


x = [1, 2, 3, 4]

# Folds the list from the left: ((1 * 2) * 3) * 4.
print(reduce(multi, x))
#output: 24

pythonic way of enumeration

x = ['a', 'b', 'c', 'd']

# enumerate yields (position, element) pairs, so no manual counter is needed.
for i, j in enumerate(x):
    # Formatting into a single string prints "0 a" on both Python 2 and
    # Python 3; `print i, j` is a syntax error on Python 3.
    print('%s %s' % (i, j))

zip and unzip

Here is example code to zip and unzip lists:

x = ['a', 'b', 'c', 'd']
y = [1, 2, 3, 4]
z = ['p', 'q', 'r', 's']

# zip is lazy on Python 3; list() materialises the tuples so they can be
# printed and then reused for the unzip step.
l = list(zip(x, y, z))

#zip
print(l)

#unzip: unpacking the tuples as arguments transposes them back into columns
p, q, r = zip(*l)

print(p)
print(q)
print(r)

Output as follows

[('a', 1, 'p'), ('b', 2, 'q'), ('c', 3, 'r'), ('d', 4, 's')]
('a', 'b', 'c', 'd')
(1, 2, 3, 4)
('p', 'q', 'r', 's')

Json

Here is a JSON round trip using a dictionary:

import json
# Round trip: dict -> JSON text -> dict.
rec = dict(name='Mike', age=45)
j = json.dumps(rec)
r = json.loads(j)

Pandas

Basics

Reading a csv file and creating a histogram is very simple in Pandas.

mport pandas as pd

df = pd.read_csv('t1.csv')
%matplotlib inline
df['salary'].hist()

Series

Series are the most important Pandas data structure. Here are their data types:

import pandas as pd
# Each snippet builds a Series from a different kind of input and inspects
# the dtype pandas chose for it; the comment below each call is the
# transcript output.

# built from range(1), i.e. the single value 0
serInt = pd.Series(range(1))
serInt.dtype
#dtype('int64')

serFloat = pd.Series([1.0,2.0])
serFloat.dtype
#dtype('float64')

# strings are stored with the generic object dtype ('O')
setStr = pd.Series(['a','b'])
setStr.dtype
#dtype('O')

# built from a dict: the values determine the dtype
serdictInt = pd.Series({'one':1, 'two':2})
serdictInt.dtype
#dtype('int64')

serDictFloat = pd.Series({'one':1.0, 'two':2.0})
serDictFloat.dtype
#dtype('float64')

sertDictStr = pd.Series({'one':'I', 'two':'II'})
sertDictStr.dtype
#dtype('O')

# NOTE(review): same input as sertDictStr above; possibly meant to show a
# different kind of key - confirm with the author.
sertDictStrKey = pd.Series({'one':'I', 'two':'II'})
sertDictStrKey.dtype
#dtype('O')

serDate = pd.Series([pd.to_datetime('2016-01-01')])
serDate.dtype
#dtype('<M8[ns]')

# the dtype is requested explicitly here rather than inferred
serCategorical = pd.Series(['a','b'],dtype='category')
serCategorical.dtype
#category

Iterating series

import pandas as pd

s = pd.Series([10, 20, 30, 40])

# Iterating a series yields its values.
for num in s:
    print(num)
# 10
# 20
# 30
# 40

1 in s
#True

10 in s
#False: because `in` only looks at the index labels

10 in s.values
#True or use the following
10 in set(s)
#True

#convert to dictionary (label -> value)
d = dict(s)
#{0: 10, 1: 20, 2: 30, 3: 40}

#directly iterate series
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (label, value) pairs. Printing an explicit tuple gives the output shown
# below on both Python 2 and Python 3.
for lab, val in s.items():
    print((lab, val))
# (0, 10)
# (1, 20)
# (2, 30)
# (3, 40)

#same thing with a dictionary
# dict.iteritems() exists only on Python 2; items() works on both.
for lab, val in dict(s).items():
    print((lab, val))
# (0, 10)
# (1, 20)
# (2, 30)
# (3, 40)

Broadcasting

import pandas as pd

# The same three numbers as a plain list (s) and as a Series (ser).
s= [10,20,30]
ser = pd.Series(s)

# A scalar is applied to every element of the series.
ser + 2
# 0    12
# 1    22
# 2    32
# dtype: int64


# s + 2 raises TypeError for a plain Python list,
# but
s * 2
#[10, 20, 30, 10, 20, 30]

# whereas on a series * multiplies each element - a completely different
# idea from standard Python lists.
ser * 2
# 0    20
# 1    40
# 2    60
# dtype: int64

#s + ser or
ser + s
# 0    20
# 1    40
# 2    60
# dtype: int64

#create second series
s1 = [40,50,60]
ser1 = pd.Series(s1)

ser + ser1
# 0    50
# 1    70
# 2    90
# dtype: int64

ser3 = pd.Series(s1, index=[2,3,4])
# 2    40
# 3    50
# 4    60
# dtype: int64

# only labels present in both series are added; every other label becomes NaN
ser + ser3
# 0     NaN
# 1     NaN
# 2    70.0
# 3     NaN
# 4     NaN
# dtype: float64

def disp(val):
    """Return ``val`` unchanged."""
    return val

# apply calls the function on each value and collects the results.
ser.apply(disp)
# 0    10
# 1    20
# 2    30
# dtype: int64

def add_2(val):
    """Return ``val`` plus 2."""
    return val + 2

ser.apply(add_2)
# 0    12
# 1    22
# 2    32
# dtype: int64

#to convert to float
ser.apply(float)
# 0    10.0
# 1    20.0
# 2    30.0
# dtype: float64

# but for type conversion it is better to use astype; note that this example
# converts to str (hence dtype object below), not to float
ser.astype(str)
# 0    10
# 1    20
# 2    30
# dtype: object

Series position and label

import pandas as pd

# The same values as a plain list (s, indexed by position) and as a Series
# with the default integer labels (ser).
s= [10,20,30]
ser = pd.Series(s)

s[0]
ser[0]
#10

# ser[-1] does not work: it raises KeyError because there is no label -1.
s[-1]
#30

# s[4] is an IndexError (index out of range)
# ser[4] is a KeyError

# s.loc[0] does not work (a list has no .loc), but:
ser.loc[0] #value based on the label
#10

#position based
# ser.loc[-1] does not work, but:
ser.iloc[-1]
#30

# ser.iloc[4] is an IndexError

# This is different with string labels:
# a series always looks up by label, not by position. This is the proof,
# using a character index.
a = [1,2,3,4,5]
ser2 = pd.Series(a, name='ser2', index=['a','b','c','d','e'])
a[-1]
# NOTE(review): with a non-integer index, [] falls back to positional access
# for integer keys; this fallback may be deprecated in newer pandas -
# confirm, and prefer .iloc[-1].
ser2[-1]
ser2.iloc[-1] #position based
#5
# ser2.loc[-1] label error
# ser2.loc[0] label error. But:
ser2.loc['a'] #label based
#1

# label based examples: the index mixes string and integer labels
ser3 = pd.Series(a, name='ser3', index = ['a','b',0,1,2])

ser3.iloc[0] #position
ser3['a']
#1

ser3.iloc[2] #position
# 0 is an existing label here, so this is a label lookup, not position 0
ser3[0]
#3

Update Series

import pandas as pd

s = [10, 20, 30]
ser = pd.Series(s, index=['a', 'b', 'c'])
# a    10
# b    20
# c    30
# dtype: int64

# update the value stored under a label
ser['c'] = 40
# a    10
# b    20
# c    40
# dtype: int64

# update the value at a position
ser.iloc[0] = 5
# a     5
# b    20
# c    40
# dtype: int64

# add series: Series.append was removed in pandas 2.0 and pd.concat is its
# replacement. A new series is returned; the existing one is not mutated.
pd.concat([ser, pd.Series([100])])
# a      5
# b     20
# c     40
# 0    100
# dtype: int64

# Series.set_value was removed in pandas 1.0; label-based assignment mutates
# the series in the same way.
ser.at['a'] = -10
# a   -10
# b    20
# c    40
# dtype: int64

# mutate with a new value: assigning to a label that does not exist yet
# adds it to the series
ser.loc['d'] = 50
# a   -10
# b    20
# c    40
# d    50
# dtype: int64

Delete and filter Series

import pandas as pd

# Deleting from a plain dict, for comparison with a series below.
d1 = {'one': 1, 'two': 2}
#{'one': 1, 'two': 2}

del d1['two']
#{'one': 1}

s = [10, 20, 30]
ser = pd.Series(s, index=['a', 'b', 'c'])
# a    10
# b    20
# c    30
# dtype: int64

# del removes the label in place, just like with a dict
del ser['b'] #mutated
# a    10
# c    30
# dtype: int64

#masking: a comparison yields a boolean series with the same labels
mask = ser > 20
# a    False
# c     True
# dtype: bool

# indexing with the mask keeps only the entries where it is True
ser[mask]
# c    30
# dtype: int64

ser.index
#Index([u'a', u'c'], dtype='object')

#masking the index
mask = ser.index == 'a'
#array([ True, False], dtype=bool)

ser[mask]
# a    10
# dtype: int64

Summaries and Duplicates

import pandas as pd

s = [10, 20, 30, 40, 50]
ser = pd.Series(s, index=['a', 'b', 'c', 'd', 'e'])

# summary statistics of a numeric series
ser.describe()
# count     5.000000
# mean     30.000000
# std      15.811388
# min      10.000000
# 25%      20.000000
# 50%      30.000000
# 75%      40.000000
# max      50.000000
# dtype: float64

# how often each distinct value occurs
ser.value_counts()
# 30    1
# 20    1
# 10    1
# 50    1
# 40    1
# dtype: int64

#categorical data
cat = ['mango', 'apple', 'banna', 'grape', 'date', 'apple', 'pinapple', 'apple', 'banna']
catser = pd.Series(cat, dtype='category')
catser.describe()
# count         9
# unique        6
# top       apple
# freq          3
# dtype: object

catser.value_counts()
# apple       3
# banna       2
# pinapple    1
# mango       1
# grape       1
# date        1
# dtype: int64

ser.duplicated()
# a    False
# b    False
# c    False
# d    False
# e    False
# dtype: bool

# Series.append was removed in pandas 2.0; pd.concat builds the same
# combined series.
ser1 = pd.concat([ser, pd.Series([10], index=['f'])])
# a    10
# b    20
# c    30
# d    40
# e    50
# f    10
# dtype: int64

ser1.duplicated() #last is duplicated with first one
# a    False
# b    False
# c    False
# d    False
# e    False
# f     True
# dtype: bool

ser.duplicated().any()
#False

ser1.duplicated().any()
#True

ser1.duplicated().all()
#False

# keep='last' flags the earlier occurrence instead of the later one
ser1.duplicated(keep='last')
# a     True
# b    False
# c    False
# d    False
# e    False
# f    False
# dtype: bool

# keep=False flags every occurrence of a duplicated value
ser1.duplicated(keep=False)
# a     True
# b    False
# c    False
# d    False
# e    False
# f     True
# dtype: bool

#filter the duplicated
mask = ser1.duplicated(keep=False)
ser1[mask]
# a    10
# f    10
# dtype: int64

#who were not duplicated
ser1[~mask]
# b    20
# c    30
# d    40
# e    50
# dtype: int64

# returns a new series; ser1 itself is left unchanged
ser1.drop_duplicates(keep=False)
# b    20
# c    30
# d    40
# e    50
# dtype: int64

#add the duplicated keys
ser_key_dups = pd.concat([ser, pd.Series([100], index=['a'])])
# a     10
# b     20
# c     30
# d     40
# e     50
# a    100
# dtype: int64

# a duplicated label returns every matching entry
ser_key_dups['a']
# a     10
# a    100
# dtype: int64

Work with NaN

import pandas as pd

# None among numeric values becomes NaN, which forces a float dtype.
s = [10, 20, 30, 40, None]
ser = pd.Series(s, index=['a', 'b', 'c', 'd', 'e'])
# a    10.0
# b    20.0
# c    30.0
# d    40.0
# e     NaN
# dtype: float64

# None used as an index label is displayed as NaN; the values stay int64.
s1 = [10, 20, 30, 40, 50]
ser1 = pd.Series(s1, index=['a', 'b', 'c', 'd', None])
# a      10
# b      20
# c      30
# d      40
# NaN    50
# dtype: int64

# Series.append was removed in pandas 2.0; pd.concat is the replacement.
ser2 = pd.concat([ser1, pd.Series([None], index=['f'])])
# a        10
# b        20
# c        30
# d        40
# NaN      50
# f      None
# dtype: object

# len counts every entry, including the missing one
len(ser2)
#6

# count ignores missing values
ser2.count()
#5

ser2.isnull()
# a      False
# b      False
# c      False
# d      False
# NaN    False
# f       True
# dtype: bool

ser2.isnull().any()
#True

ser2.isnull().all()
#False

mask = ser2.isnull()
# a      False
# b      False
# c      False
# d      False
# NaN    False
# f       True
# dtype: bool
ser2[mask]
# f    None
# dtype: object

ser2.dropna()
# a      10
# b      20
# c      30
# d      40
# NaN    50
# dtype: object

# fillna(method='ffill') is deprecated in recent pandas; ffill() performs the
# same forward fill.
ser2.ffill()
# a      10
# b      20
# c      30
# d      40
# NaN    50 --fill the --\
# f      50 <------------/
# dtype: int64

import numpy as np
pd.Series([np.nan])
# 0   NaN
# dtype: float64

Series -> csv -> Data Frame

It is interesting to save a series to a csv file and read the series back. You can also create a Data Frame from the same csv file.

import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

s = [10, 20, 5, 40, 50]
ser = pd.Series(s, name='dummy', index=['a', 'b', 'c', 'd', 'e'])

ser.name
# line plot of the series, then label both axes
ax = ser.plot()
ax.set_xlabel("x label")
ax.set_ylabel("y label")

#save series to file, writing the header row and naming the index column
ser.to_csv('test.csv', header=True, index_label='no#')
# Series.from_csv was removed in pandas 1.0. read_csv with the first column
# as the index, squeezed from a one-column frame to a Series, replaces it.
#read from a file (header=None: the header line is read as an ordinary row)
serff = pd.read_csv('test.csv', header=None, index_col=0).squeeze('columns')
#read with the headers
serffh = pd.read_csv('test.csv', header=0, index_col=0).squeeze('columns')
# no#
# a    10
# b    20
# c     5
# d    40
# e    50
# Name: dummy, dtype: int64

#read as data frame (df)
df = pd.read_csv('test.csv')
# no#   dummy
# 0 a   10
# 1 b   20
# 2 c   5
# 3 d   40
# 4 e   50

# selecting one column of the frame gives a series again
ser_from_df = df['dummy']
# 0    10
# 1    20
# 2     5
# 3    40
# 4    50
# Name: dummy, dtype: int64

Appendix

Mortgage payment equation

# math was used without being imported in this snippet.
import math

principal = 400000
rate = 0.0535
# the rate is divided into 12 monthly parts
monthly_rate = rate / 12
payment_period = 30
# 12 installments per unit of payment_period
no_of_installments = payment_period * 12
# (1 + monthly_rate) ** no_of_installments, reused in numerator and denominator
r = math.pow((monthly_rate + 1), no_of_installments)
payment = (monthly_rate * r) / (r - 1) * principal
# The parenthesised form runs on both Python 2 and Python 3.
print(payment)

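Here $P$ is the principal, $i$ is the monthly rate, and $n$ is the number of instalments, so the monthly payment is $M = P \cdot \dfrac{i\,(1+i)^{n}}{(1+i)^{n} - 1}$.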

Comments

Popular posts from this blog

Parse the namespace based XML using Python

Blog Writing Workflows

Markdown blog writer for blogger