Friday, April 2, 2010

Clean the table before deleting duplicates

Clean foreign tables and delete duplicates in referenced table, 8 steps











Test database tables:
create table item
(
item_id serial not null,
item text not null,
constraint pk_item primary key(item_id),
constraint uk_item unique (item)
);

   
create table purchased
(
purchased_id serial not null,
item_id int not null,
qty int not null,
constraint pk_purchased primary key(purchased_id),
constraint fk_purchased__item foreign key(item_id) references item(item_id) 
);

Test data:
insert into item (item) values
('cpu'),
('keyboard'),
('keyboard '),
('mouse');

insert into purchased(item_id,qty) values
(1,2),
(2,26),
(3,19),
(4,51),
(3,5);



test=# select * from item;
 item_id |   item    
---------+-----------
       1 | cpu
       2 | keyboard
       3 | keyboard 
       4 | mouse
(4 rows)

test=# select * from purchased;
 purchased_id | item_id | qty 
--------------+---------+-----
            1 |       1 |   2
            2 |       2 |  26
            3 |       3 |  19
            4 |       4 |  51
            5 |       3 |   5
(5 rows)



Reporting summary on items:
select i.item_id, i.item, sum(p.qty) 
from purchased p 
join item i using(item_id) 
group by i.item_id
    , i.item
order by i.item

Output:
item_id |   item    | sum 
---------+-----------+-----
       1 | cpu       |   2
       2 | keyboard  |  26
       3 | keyboard  |  24
       4 | mouse     |  51
(4 rows)
Something is wrong, not with your query, but with the data


CLEAN AND DELETE DUPLICATE DATA. 8 STEPS

Step 1 of 8 - Identify the duplicates:
select trim(lower(i.item)) as unified_name, min(item_id) as unified_id
from item i
group by unified_name
having count(*) > 1
order by unified_name

unified_name | unified_id 
--------------+------------
 keyboard     |          2
(1 row)


Step 2 of 8 - Identify purchases that belongs to duplicated item:
select p.purchased_id, unified_item.unified_id
from purchased p 
join item i using(item_id)
join 
(
 select trim(lower(i.item)) as unified_name, min(item_id) as unified_id
 from item i
 group by unified_name
 having count(*) > 1 
 order by unified_name
) as unified_item on unified_item.unified_name = trim(lower(i.item))

purchased_id | unified_id 
--------------+------------
            2 |          2
            3 |          2
            5 |          2
(3 rows)

Step 3 of 8 - Update those purchases that belongs to duplicated item_id with the unified id:
update purchased set item_id = unified_item.unified_id  

 -- move the sql from step 2 here, then remove the select clause
 -- select p.purchased_id, unified_item.unified_id
 from purchased p
 join item i on i.item_id = p.item_id
 join 
 (
  select trim(lower(i.item)) as unified_name, min(item_id) as unified_id
  from item i
  group by unified_name
  having count(*) > 1 
  order by unified_name
 ) as unified_item 
 on unified_item.unified_name = trim(lower(i.item))
 
where p.purchased_id = purchased.purchased_id

UPDATE 3

Step 4 of 8 - Verify Correctness:
select i.item_id, i.item, sum(p.qty) 
from purchased p 
join item i using(item_id) 
group by i.item_id
    , i.item
order by i.item

 item_id |   item   | sum 
---------+----------+-----
       1 | cpu      |   2
       2 | keyboard |  50
       4 | mouse    |  51
(3 rows)

Step 5 of 8 - Delete duplicates
select * from item;

 item_id |   item    
---------+-----------
       1 | cpu
       2 | keyboard
       3 | keyboard 
       4 | mouse
(4 rows)


delete from item where item_id not in 
(select min(item_id) from item group by trim(lower(item)))

DELETE 1

Step 6 of 8 - Verify correctness:
select * from item;

 item_id |   item   
---------+----------
       1 | cpu
       2 | keyboard
       4 | mouse
(3 rows)

Step 7 of 8 - Future-proof uniqueness:
create unique index ux_item on item using btree(trim(lower(item))) 

Step 8 of 8 - Verify future-proofing (note that extra spaces there):
insert into item(item) values('keyboard  ');

ERROR:  duplicate key value violates unique constraint "ux_item"

select * from item;

 item_id |   item   
---------+----------
       1 | cpu
       2 | keyboard
       4 | mouse
(3 rows)

1 comment: