Python tip to group arrays

Grouping is one of the most important operations in data cleansing. In Python, the itertools package is one of the most important tools for it. For example,

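The following source shows how to group an array by name and age using Python's built-in libraries. Note that itertools.groupby only groups consecutive items, so the array is first sorted by the same key.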

import itertools
from operator import itemgetter
import json
import StringIO
import gzip
import logging
from module import mytest

# Sample records. Several rows intentionally share the same (name, age)
# pair so that grouping has something to collapse.
source = [
    dict(name='z', age=21, other='z-21'),
    dict(name='z', age=21, other='z-21-duplicated-1'),
    dict(name='z', age=21, other='z-21-duplicated-2'),
    dict(name='z', age=20, other='z-20'),
    dict(name='c', age=31, other='c-31'),
    dict(name='c', age=30, other='c-30'),
]

# Composite key function: given a record, returns the tuple (name, age).
grouper = itemgetter('name', 'age')

# Work on a sorted copy so that `source` keeps its original order.
# The sort is stable, so records with equal keys keep their relative order.
s = list(source)
s.sort(key=grouper)

import itertools
from operator import itemgetter
import json
import StringIO
import gzip
import logging
from module import mytest

# Sample records. Several rows intentionally share the same (name, age)
# pair so that grouping has something to collapse.
source = [
    {'name': 'z', 'age': 21, 'other': 'z-21'},
    {'name': 'z', 'age': 21, 'other': 'z-21-duplicated-1'},
    {'name': 'z', 'age': 21, 'other': 'z-21-duplicated-2'},
    {'name': 'z', 'age': 20, 'other': 'z-20'},
    {'name': 'c', 'age': 31, 'other': 'c-31'},
    {'name': 'c', 'age': 30, 'other': 'c-30'},
]

# Composite key function: given a record, returns the tuple (name, age).
grouper = itemgetter('name', 'age')

# groupby only merges *adjacent* items with equal keys, so sort a copy of
# the records by the very same key first.
s = list(source)
s.sort(key=grouper)

# Walk the sorted records one (name, age) group at a time.
for key, group in itertools.groupby(s, grouper):
    print('Key : {}'.format(key))
    # `group` is a one-shot iterator; materialize it to print its records.
    print(list(group))

# Optional variant (left disabled): instead of printing, write each group to
# its own gzip file named data/<name>-<age>.gz, dumping every record as JSON.
# NOTE(review): no separator is written between records, and on Python 3
# GzipFile.write() presumably needs bytes rather than str -- confirm before
# enabling.
# for key, group in itertools.groupby(s, key=grouper):
#     with gzip.GzipFile('data/{}-{}.gz'.format(key[0],key[1]), 'wb') as gzipped_file:
#         for r in group:
#             print(r)
#             gzipped_file.write(json.dumps(r))
           

Here is the result, grouped by name and age:

Key : ('c', 30)
[{'age': 30, 'other': 'c-30', 'name': 'c'}]
Key : ('c', 31)
[{'age': 31, 'other': 'c-31', 'name': 'c'}]
Key : ('z', 20)
[{'age': 20, 'other': 'z-20', 'name': 'z'}]
Key : ('z', 21)
[ {'age': 21, 'other': 'z-21', 'name': 'z'}
, {'age': 21, 'other': 'z-21-duplicated-1', 'name': 'z'}
, {'age': 21, 'other': 'z-21-duplicated-2', 'name': 'z'}
]

Comments

Popular posts from this blog

How To: GitHub projects in Spring Tool Suite

Spring 3 Part 7: Spring with Databases

Parse the namespace based XML using Python