Python tip to group arrays
Grouping is one of the most important operations in data cleansing. In Python, the itertools package is one of the most important tools for this. For example,
the following source shows how to group the array by name
and age
using Python's built-in libraries.
import itertools
from operator import itemgetter
import json
import StringIO
import gzip
import logging
from module import mytest
# Sample records. Several rows share the same (name, age) pair so that
# grouping has duplicates to collect.
source = [
    {'name': 'z', 'age': 21, 'other': 'z-21'},
    {'name': 'z', 'age': 21, 'other': 'z-21-duplicated-1'},
    {'name': 'z', 'age': 21, 'other': 'z-21-duplicated-2'},
    {'name': 'z', 'age': 20, 'other': 'z-20'},
    {'name': 'c', 'age': 31, 'other': 'c-31'},
    {'name': 'c', 'age': 30, 'other': 'c-30'},
]

# Composite key: called on a record, this returns the tuple (name, age).
grouper = itemgetter('name', 'age')

# Work on a copy so `source` keeps its original order, then order the copy
# by the composite key (the sort is stable, so ties keep their input order).
s = list(source)
s.sort(key=grouper)
import itertools
from operator import itemgetter
import json
import StringIO
import gzip
import logging
from module import mytest
# Sample records to group. Several rows share the same (name, age) pair so
# the grouping step below has duplicates to collect.
source = [
    {'name': 'z', 'age': 21, 'other': 'z-21'},
    {'name': 'z', 'age': 21, 'other': 'z-21-duplicated-1'},
    {'name': 'z', 'age': 21, 'other': 'z-21-duplicated-2'},
    {'name': 'z', 'age': 20, 'other': 'z-20'},
    {'name': 'c', 'age': 31, 'other': 'c-31'},
    {'name': 'c', 'age': 30, 'other': 'c-30'},
]

# Composite key: called on a record, this returns the tuple (name, age).
grouper = itemgetter('name', 'age')

# itertools.groupby only merges *consecutive* items with equal keys, so the
# records must be sorted by the same key before they are grouped.
s = sorted(source, key=grouper)

# Print each (name, age) key followed by the records that share it.
for key, group in itertools.groupby(s, key=grouper):
    print('Key : {}'.format(key))
    # `group` is a one-shot iterator; materialise it so the rows are printed.
    print(list(group))
# Alternative: write each group to its own gzip file instead of printing it.
# for key, group in itertools.groupby(s, key=grouper):
#     with gzip.GzipFile('data/{}-{}.gz'.format(key[0],key[1]), 'wb') as gzipped_file:
#         for r in group:
#             print(r)
#             gzipped_file.write(json.dumps(r))
Here is the result, grouped by name and age:
Key : ('c', 30)
[{'age': 30, 'other': 'c-30', 'name': 'c'}]
Key : ('c', 31)
[{'age': 31, 'other': 'c-31', 'name': 'c'}]
Key : ('z', 20)
[{'age': 20, 'other': 'z-20', 'name': 'z'}]
Key : ('z', 21)
[ {'age': 21, 'other': 'z-21', 'name': 'z'}
, {'age': 21, 'other': 'z-21-duplicated-1', 'name': 'z'}
, {'age': 21, 'other': 'z-21-duplicated-2', 'name': 'z'}
]
Comments
Post a Comment
commented your blog