Python fun
This blog is a reference guide for the analytics topics that I plan to address in upcoming posts.
Python
This section covers only Python.
Sort with Lambda
First, I would like to show how a lambda can be used to sort a word-count dictionary (wc) by key or by value (the name of the fruit and its count).
__author__ = 'ojitha'
wc ={'orange':2, 'mango':1, 'cherry':8, 'apple':5}
'''sort on key'''
print(sorted(wc.items(), key=lambda (word, count): word))
'''sort on the count'''
print(sorted(wc.items(), key=lambda (word, count): count))
The output of the above code is as follows:
[('apple', 5), ('cherry', 8), ('mango', 1), ('orange', 2)]
[('mango', 1), ('orange', 2), ('apple', 5), ('cherry', 8)]
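Note that the tuple-unpacking lambda above is Python 2 only syntax. As a sketch, the same sorts can be written so they also run on Python 3, by indexing the (word, count) pairs instead of unpacking them:
print(sorted(wc.items(), key=lambda pair: pair[0]))  #sort on the key (word)
print(sorted(wc.items(), key=lambda pair: pair[1]))  #sort on the count
The output is identical to the above.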
List Comprehensions
Lists of evens and odds can be created as follows:
listOfEvens = [x for x in range(10) if x %2 == 0]
print (listOfEvens)
listOfOdds = [x for x in range(10) if x %2 != 0]
print (listOfOdds)
A nested list comprehension that creates pairs of numbers is as follows:
pairs = [(x,y) for x in range(5) for y in range(5)]
print(pairs)
This will create the pairs as follows:
[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (4, 0), (4, 1), (4, 2), (4, 3), (4, 4)]
As a more practical example, consider the following CSV dataset:
name,age,salary
Mike,20,3000
Ren,30,4050.41
Tom,25,2500.34
Hanks,40,5000.45
The following program reads the above dataset and finds the names of employees whose salary is above 3000.
import csv
def read_csv(file_name):
    records = []
    with open(file_name, 'r') as f:
        rows = csv.reader(f)
        headers = next(rows)
        for row in rows:
            rec = {}
            for i, val in enumerate(row):
                rec[headers[i]] = val
            records.append(rec)
    return records
dics = read_csv('t1.csv')
len(dics)
#4
r1 = dics[1]
r1['salary']
#'4050.41'
total_salary = sum(float(r['salary']) for r in dics)
#list comprehension
names = [r['name'] for r in dics if float(r['salary']) > 3000]
print ' and '.join(names)
#Ren and Hanks
As with lists, you can write comprehensions for other data structures such as dictionaries, as shown in the sketch below.
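For example, a dictionary comprehension can build a name-to-salary mapping from the records read above (a minimal sketch, reusing the dics list produced by read_csv; the name salary_by_name is chosen here for illustration):
salary_by_name = {r['name']: float(r['salary']) for r in dics}
print(salary_by_name['Ren'])
#output: 4050.41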
The list of records created by the above code can be sorted as follows:
def sorting_key(rec):
    return rec['name']

dics.sort(key=sorting_key)
#or use a lambda
dics.sort(key=lambda x: x['name'])
map function
The map function applies a given function to each element of a sequence (here, pairwise across two sequences):
def multi(l, r):
    return l * r
x = [1,2,3,4]
y = [10,20,30,40]
print(map(multi,x, y))
#output : [10, 40, 90, 160]
filter function
Here, filter applies a predicate function to a sequence and keeps only the elements for which it returns True:
def is_even(x):
    return x % 2 == 0
x = [1,2,3,4,5,6,7,8,14,31,45]
print(filter(is_even, x))
#output [2, 4, 6, 8, 14]
reduce function
The reduce function applies the function cumulatively to the items of a sequence, reducing it to a single value (in Python 3, import it first with from functools import reduce):
def multi(x, y): return x * y
x = [1,2,3,4]
print(reduce(multi, x))
#output: 24
Pythonic way of enumeration
x = ['a','b','c','d']
for i, j in enumerate(x):
    print i, j
zip and unzip
Here is example code to zip and unzip lists:
x = ['a','b','c','d']
y = [1,2,3,4]
z = ['p','q','r','s']
l = zip(x,y,z)
#zip
print l
#unzip
p,q,r = zip(*l)
print p
print q
print r
The output is as follows:
[('a', 1, 'p'), ('b', 2, 'q'), ('c', 3, 'r'), ('d', 4, 's')]
('a', 'b', 'c', 'd')
(1, 2, 3, 4)
('p', 'q', 'r', 's')
JSON
Here is how a dictionary is transferred to and from JSON:
import json
rec = {'name':'Mike', 'age':45}
j = json.dumps(rec)  #serialize the dictionary to a JSON string
r = json.loads(j)  #parse the JSON string back into a dictionary
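A quick check of the round trip (a minimal sketch; the key order inside the JSON string may vary):
print(j)
#a JSON string, e.g. {"age": 45, "name": "Mike"}
print(r == rec)
#True: decoding gives back an equal dictionary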
Pandas
Basics
Reading a CSV file and creating a histogram is very simple in Pandas.
import pandas as pd
df = pd.read_csv('t1.csv')
%matplotlib inline
df['salary'].hist()
Series
The Pandas Series is the most important data structure. Here are its data types:
import pandas as pd
serInt = pd.Series(range(1))
serInt.dtype
#dtype('int64')
serFloat = pd.Series([1.0,2.0])
serFloat.dtype
#dtype('float64')
setStr = pd.Series(['a','b'])
setStr.dtype
#dtype('O')
serdictInt = pd.Series({'one':1, 'two':2})
serdictInt.dtype
#dtype('int64')
serDictFloat = pd.Series({'one':1.0, 'two':2.0})
serDictFloat.dtype
#dtype('float64')
sertDictStr = pd.Series({'one':'I', 'two':'II'})
sertDictStr.dtype
#dtype('O')
sertDictStrKey = pd.Series({'one':'I', 'two':'II'})
sertDictStrKey.dtype
#dtype('O')
serDate = pd.Series([pd.to_datetime('2016-01-01')])
serDate.dtype
#dtype('<M8[ns]')
serCategorical = pd.Series(['a','b'],dtype='category')
serCategorical.dtype
#category
Iterating series
import pandas as pd
s = pd.Series([10,20,30,40])
for num in s:
    print(num)
# 10
# 20
# 30
# 40
1 in s
#True
10 in s
#False: because the in operator checks the index, not the values
10 in s.values
#True or use the following
10 in set(s)
#True
#convert to dictionary
d = dict(s)
#{0: 10, 1: 20, 2: 30, 3: 40}
#directly iterate series
for lab, val in s.iteritems():
    print (lab, val)
# (0, 10)
# (1, 20)
# (2, 30)
# (3, 40)
#same thing with a dictionary
for lab, val in dict(s).iteritems():
    print (lab, val)
# (0, 10)
# (1, 20)
# (2, 30)
# (3, 40)
Broadcasting
import pandas as pd
s= [10,20,30]
ser = pd.Series(s)
ser + 2
# 0 12
# 1 22
# 2 32
# dtype: int64
#s + 2 does not work for a plain Python list (it raises a TypeError)
# but
s * 2
#[10, 20, 30, 10, 20, 30]
#list repetition: the Series below behaves completely differently from the standard Python list
ser * 2
# 0 20
# 1 40
# 2 60
# dtype: int64
#s + ser also works, or equivalently:
ser + s
# 0 20
# 1 40
# 2 60
# dtype: int64
#create second series
s1 = [40,50,60]
ser1 = pd.Series(s1)
ser + ser1
# 0 50
# 1 70
# 2 90
# dtype: int64
ser3 = pd.Series(s1, index=[2,3,4])
# 2 40
# 3 50
# 4 60
# dtype: int64
#only the intersecting labels are added; the rest become NaN
ser + ser3
# 0 NaN
# 1 NaN
# 2 70.0
# 3 NaN
# 4 NaN
# dtype: float64
def disp(val):
    return val
ser.apply(disp)
# 0 10
# 1 20
# 2 30
# dtype: int64
def add_2(val):
    return val + 2
ser.apply(add_2)
# 0 12
# 1 22
# 2 32
# dtype: int64
#to convert to float
ser.apply(float)
# 0 10.0
# 1 20.0
# 2 30.0
# dtype: float64
#but for type conversion it is better to use astype, e.g. to string:
ser.astype(str)
# 0 10
# 1 20
# 2 30
# dtype: object
Series position and label
import pandas as pd
s= [10,20,30]
ser = pd.Series(s)
s[0]
ser[0]
#10
#ser[-1] does not work; it raises a KeyError because there is no label -1
s[-1]
#30
#s[4] raises IndexError: list index out of range
#ser[4] raises a KeyError
#s.loc[0] does not work (a list has no .loc), but:
ser.loc[0] #value based on the label
#10
#position based
#ser.loc[-1] does not work, but:
ser.iloc[-1]
#30
#ser.iloc[4] is IndexError
#but this is different when the index labels are strings;
#[] looks up labels first and falls back to position for integer keys. This is the proof:
#string index
a = [1,2,3,4,5]
ser2 = pd.Series(a, name='ser2', index=['a','b','c','d','e'])
a[-1]
ser2[-1]
ser2.iloc[-1] #position based
#5
# ser2.loc[-1] raises a KeyError (no such label)
# ser2.loc[0] also raises a KeyError. But:
ser2.loc['a'] #label based
#1
#label lookup examples with a mixed (string and integer) index
ser3 = pd.Series(a, name='ser3', index = ['a','b',0,1,2])
ser3.iloc[0] #position
ser3['a']
#1
ser3.iloc[2] #position
ser3[0]
#3
Update Series
import pandas as pd
s= [10,20,30]
ser = pd.Series(s, index=['a','b','c'])
# a 10
# b 20
# c 30
# dtype: int64
#update by label
ser['c'] = 40
# a 10
# b 20
# c 40
# dtype: int64
ser.iloc[0] = 5
# a 5
# b 20
# c 30
# dtype: int64
#add series
ser.append(pd.Series([100])) #returns a new Series; the existing one is not mutated
# a 5
# b 20
# c 30
# 0 100
# dtype: int64
ser.set_value('a',-10) #mutate the series
# a -10
# b 20
# c 30
# dtype: int64
#mutates by adding a new label/value pair
ser.set_value('d',50)
# a -10
# b 20
# c 30
# d 50
# dtype: int64
Delete and filter Series
import pandas as pd
d1 = {'one': 1, 'two':2}
#{'one': 1, 'two': 2}
del d1['two']
#{'one': 1}
s= [10,20,30]
ser = pd.Series(s, index=['a','b','c'])
# a 10
# b 20
# c 30
# dtype: int64
del ser['b'] #mutated
# a 10
# c 30
# dtype: int64
#masking
mask = ser > 20
# a False
# c True
# dtype: bool
ser[mask]
# c 30
# dtype: int64
ser.index
#Index([u'a', u'c'], dtype='object')
#masking the index
mask = ser.index == 'a'
#array([ True, False], dtype=bool)
ser[mask]
# a 10
# dtype: int64
Summaries and Duplicates
import pandas as pd
s= [10,20,30,40,50]
ser = pd.Series(s, index=['a','b','c','d','e'])
ser.describe()
# count 5.000000
# mean 30.000000
# std 15.811388
# min 10.000000
# 25% 20.000000
# 50% 30.000000
# 75% 40.000000
# max 50.000000
# dtype: float64
ser.value_counts()
# 30 1
# 20 1
# 10 1
# 50 1
# 40 1
# dtype: int64
#categorical data
cat = ['mango','apple','banana','grape','date','apple','pineapple','apple','banana']
catser = pd.Series(cat, dtype='category')
catser.describe()
# count 9
# unique 6
# top apple
# freq 3
# dtype: object
catser.value_counts()
# apple 3
# banana 2
# pineapple 1
# mango 1
# grape 1
# date 1
# dtype: int64
ser.duplicated()
# a False
# b False
# c False
# d False
# e False
# dtype: bool
ser1 = ser.append(pd.Series([10],index=['f']))
# a 10
# b 20
# c 30
# d 40
# e 50
# f 10
# dtype: int64
ser1.duplicated() #the last value duplicates the first one
# a False
# b False
# c False
# d False
# e False
# f True
# dtype: bool
ser.duplicated().any()
#False
ser1.duplicated().any()
#True
ser1.duplicated().all()
#False
ser1.duplicated(keep='last')
# a True
# b False
# c False
# d False
# e False
# f False
# dtype: bool
ser1.duplicated(keep=False)
# a True
# b False
# c False
# d False
# e False
# f True
# dtype: bool
#filter the duplicated values
mask = ser1.duplicated(keep=False)
ser1[mask]
# a 10
# f 10
# dtype: int64
#values that are not duplicated
ser1[~mask]
# b 20
# c 30
# d 40
# e 50
# dtype: int64
ser1.drop_duplicates(keep=False) #returns a new Series; ser1 is not mutated
# b 20
# c 30
# d 40
# e 50
# dtype: int64
#append an entry with a duplicated index label
ser_key_dups = ser.append(pd.Series([100], index=['a']))
# a 10
# b 20
# c 30
# d 40
# e 50
# a 100
# dtype: int64
ser_key_dups['a']
# a 10
# a 100
# dtype: int64
Work with NaN
import pandas as pd
s= [10,20,30,40,None]
ser = pd.Series(s, index=['a','b','c','d','e'])
# a 10.0
# b 20.0
# c 30.0
# d 40.0
# e NaN
# dtype: float64
s1= [10,20,30,40,50]
ser1 = pd.Series(s1, index=['a','b','c','d',None])
# a 10
# b 20
# c 30
# d 40
# NaN 50
# dtype: int64
ser2 = ser1.append( pd.Series([None], index=['f']))
# a       10
# b       20
# c       30
# d       40
# NaN     50
# f     None
# dtype: object
len(ser2)
#6
ser2.count()
#5
ser2.isnull()
# a False
# b False
# c False
# d False
# NaN False
# f True
# dtype: bool
ser2.isnull().any()
#True
ser2.isnull().all()
#False
mask = ser2.isnull()
# a False
# b False
# c False
# d False
# NaN False
# f True
# dtype: bool
ser2[mask]
# f None
# dtype: object
ser2.dropna()
# a 10
# b 20
# c 30
# d 40
# NaN 50
# dtype: object
ser2.fillna(method='ffill')
# a 10
# b 20
# c 30
# d 40
# NaN    50
# f      50   (forward-filled from the previous value)
# dtype: object
import numpy as np
pd.Series([np.nan])
# 0 NaN
# dtype: float64
Series -> CSV -> DataFrame
It is interesting to save a Series to a CSV file and read it back. You can also create a DataFrame from the same CSV file.
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
s= [10,20,5,40,50]
ser = pd.Series(s, name='dummy' ,index=['a','b','c','d','e'])
ser.name
ax = ser.plot()
ax.set_xlabel("x label")
ax.set_ylabel("y label")
#save series to file
ser.to_csv('test.csv', header=True, index_label='no#')
#read from a file
serff= pd.Series.from_csv('test.csv')
#read with the headers
serffh = pd.Series.from_csv('test.csv',header=0)
# no#
# a 10
# b 20
# c 5
# d 40
# e 50
# Name: dummy, dtype: int64
#read as data frame (df)
df = pd.read_csv('test.csv')
# no# dummy
# 0 a 10
# 1 b 20
# 2 c 5
# 3 d 40
# 4 e 50
ser_from_df = df['dummy']
# 0 10
# 1 20
# 2 5
# 3 40
# 4 50
# Name: dummy, dtype: int64
Appendix
Mortgage payment equation
import math

principal = 400000
rate = 0.0535
monthly_rate = rate/12
payment_period = 30  # years
no_of_installments = payment_period * 12
r = math.pow((monthly_rate + 1), no_of_installments)
payment = (monthly_rate * r)/(r - 1) * principal
print(payment)
The monthly payment is given by payment = P * r * (1 + r)^n / ((1 + r)^n - 1), where P is the principal, r is the monthly rate, and n is the number of instalments.
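As a rough check with the values above: r = 0.0535/12 ≈ 0.004458, n = 360, and (1 + r)^n ≈ 4.96, so the payment is approximately 400000 × 0.004458 × 4.96 / 3.96 ≈ 2,234 per month.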