#!/usr/bin/perl -w # # Zymonic Business Process and Information Management System # Copyright Zednax Limited 2008 - # For Authors and Changelog see the subversion history # error handling BEGIN { use Zymonic::Utils qw(death_handler clean); $main::SIG{__DIE__} = \&death_handler; } # Modules use Zymonic; use Zymonic::Config; $Zymonic::system = ''; $Zymonic::session = ''; $Zymonic::system = clean( shift @ARGV, '_' ) || ''; my $table_name = clean( shift @ARGV, '_' ) || ''; my $id_field = clean( shift @ARGV, '_' ) || ''; my $key_fields_in = clean( shift @ARGV, '_,' ) || ''; my $oldest = clean( shift @ARGV ) || ''; my $real_delete = clean( shift @ARGV ) || ''; my @key_fields = split( ',', $key_fields_in ); unless ( $Zymonic::system && $table_name && $id_field && $key_fields_in ) { print "Usage: remove_table_duplicates.pl [system] [table name] [id field] [fields] [oldest flag *] [real delete *]\n" . "\t *=optional\n" . "\t system - zymonic system to act on\n" . "\t table name - name of the table to remove duplicates from\n" . "\t id field - unique id field of each record in the table\n" . "\t fields - comma seperated list of fields in the table to use to identical duplicates\n" . "\t oldest flag - by default this will keep the most recent record, set this flag to keep the oldest\n" . "\t real delete - by default the duplicate record will just be marked as deleted, set this flag to actually remove them\n"; exit(0); } $Zymonic::ZCONFIG{$Zymonic::system} = Zymonic::Config->new( system_name => $Zymonic::system, config_dir => "/etc/zymonic", ip_address => '127.0.0.1', protocol => 'http' ); my $db = $Zymonic::ZCONFIG{$Zymonic::system}->{DB}; # find duplicates, those with more than 1 record my $field_list = join( ', ', @key_fields ); my $record_counts = $db->run_query( { string => "SELECT $field_list, COUNT(*) AS zz_count FROM $table_name GROUP BY $field_list HAVING COUNT(*) > 1", params => [], } ); # for each found remove the duplicates foreach my $record ( @{$record_counts} ) { print "Found $record->{zz_count} duplicates where: " . join( ', ', map { $_ . '=' . $record->{$_} } @key_fields ) . "\n"; my $duplicated_records = $db->run_query( { string => "SELECT $id_field, $field_list FROM $table_name WHERE " . join( ' AND ', map { $_ . '= ?' } @key_fields ) . ' ORDER BY zzlu ' . ( $oldest ? 'ASC' : 'DESC' ), params => [ map { $record->{$_} } @key_fields ], } ); # delete all but the first record (this will be oldest/newest from order in SQL) shift @{$duplicated_records}; my @remaining_ids = map { $_->{$id_field} } @{$duplicated_records}; my $batch_size = 200; while (@remaining_ids) { # do this in batches so we don't break DBI by having too many binding params my @ids = splice( @remaining_ids, 0, $batch_size ); my $result; if ($real_delete) { $result = $db->run_statement( { string => "DELETE FROM $table_name WHERE $id_field IN (" . join( ', ', map { '?' } @ids ) . ')', params => [@ids], } ); } else { $result = $db->run_statement( { string => "UPDATE $table_name SET deleted = ? WHERE $id_field IN (" . join( ', ', map { '?' } @ids ) . ')', params => [ 'Y', @ids ], } ); } print 'Deleting ' . ( $real_delete ? '(really!) ' : '' ) . ( scalar @ids ) . " records, result: $result\n"; } }