MySQL Handle Duplicates

Overview

Duplicate data can cause data integrity issues and performance problems. MySQL provides several methods to find, prevent, and handle duplicate records in tables.

Duplicate Types

  • Exact Duplicates: All columns identical
  • Partial Duplicates: Some columns identical
  • Primary Key Duplicates: Duplicate primary keys
  • Unique Key Duplicates: Duplicate unique constraints
  • Business Duplicates: Logical duplicates (same person multiple times)

Preventing Duplicates

PRIMARY KEY Constraint

-- Prevent exact row duplicates
CREATE TABLE users (
    id INT PRIMARY KEY AUTO_INCREMENT,
    email VARCHAR(255) UNIQUE,
    name VARCHAR(100)
);

-- Insert attempt with duplicate primary key fails
INSERT INTO users (id, name, email)
VALUES (1, 'John', 'john@example.com');
-- Error: Duplicate entry for key 'PRIMARY'

UNIQUE Constraint

-- Prevent duplicate values in specific columns
CREATE TABLE emails (
    id INT PRIMARY KEY AUTO_INCREMENT,
    email VARCHAR(255) UNIQUE,
    user_id INT
);

-- Insert duplicate email fails
INSERT INTO emails (email, user_id)
VALUES ('john@example.com', 1);
-- Error: Duplicate entry for key 'email'

Composite Unique Constraint

-- Prevent combination duplicates
CREATE TABLE user_emails (
    id INT PRIMARY KEY AUTO_INCREMENT,
    user_id INT,
    email_type VARCHAR(20),
    email VARCHAR(255),
    UNIQUE (user_id, email_type)
);

-- Insert duplicate user_id + email_type fails
INSERT INTO user_emails (user_id, email_type, email)
VALUES (1, 'work', 'john@example.com'),
       (1, 'work', 'jane@example.com');
-- Error: Duplicate entry for key 'user_id'

Finding Duplicates

Group By Method

-- Find duplicates by specific column
SELECT 
    email,
    COUNT(*) AS duplicate_count
FROM users
GROUP BY email
HAVING COUNT(*) > 1;

-- Find all columns that are duplicates
SELECT 
    email,
    name,
    COUNT(*) AS duplicate_count
FROM users
GROUP BY email, name
HAVING COUNT(*) > 1;

Subquery Method

-- Find duplicate records
SELECT u.*
FROM users u
WHERE email IN (
    SELECT email
    FROM users
    GROUP BY email
    HAVING COUNT(*) > 1
);

-- Show one instance per duplicate
SELECT u.*
FROM users u
WHERE u.id IN (
    SELECT MIN(id)
    FROM users
    WHERE email = u.email
);

Self JOIN Method

-- Find duplicate records using self-join
SELECT u1.id, u1.email, u1.name
FROM users u1
JOIN users u2 ON u1.email = u2.email AND u1.id > u2.id;

-- Count duplicates per value
SELECT 
    u1.email,
    COUNT(*) AS duplicate_count
FROM users u1
JOIN users u2 ON u1.email = u2.email AND u1.id > u2.id
GROUP BY u1.email;

Removing Duplicates

Delete Using Self JOIN

-- Keep the lowest ID for each duplicate
DELETE u1
FROM users u1
JOIN users u2 ON u1.email = u2.email AND u1.id > u2.id;

-- Check deleted count
SELECT ROW_COUNT();

Delete Using Subquery

-- Delete duplicates, keep first occurrence
DELETE FROM users
WHERE id NOT IN (
    SELECT MIN(id)
    FROM users
    GROUP BY email
);

-- Delete duplicates, keep highest ID
DELETE FROM users
WHERE id NOT IN (
    SELECT MAX(id)
    FROM users
    GROUP BY email
);

Delete Using Temporary Table

-- Create temporary table with unique records
CREATE TEMPORARY TABLE temp_unique_users AS
SELECT 
    MIN(id) AS id,
    email,
    name
FROM users
GROUP BY email, name;

-- Truncate original table
TRUNCATE TABLE users;

-- Insert unique records
INSERT INTO users (id, email, name)
SELECT * FROM temp_unique_users;

INSERT IGNORE / INSERT IGNORE

Skip Duplicate Errors

-- Ignore duplicate errors
INSERT IGNORE INTO users (email, name)
VALUES ('john@example.com', 'John'),
       ('jane@example.com', 'Jane'),
       ('john@example.com', 'John Duplicate');  -- Skipped

-- Check affected rows
SELECT ROW_COUNT();  -- Shows 2 (1 skipped)

REPLACE INTO / REPLACE INTO

Replace Existing Records

-- Replace if duplicate key exists
REPLACE INTO users (id, email, name)
VALUES (1, 'john@example.com', 'John Updated');

-- Equivalent to DELETE + INSERT
-- If id=1 exists, delete it, then insert new record

ON DUPLICATE KEY UPDATE / ON DUPLICATE KEY UPDATE

Update on Duplicate

-- Update if duplicate key exists
INSERT INTO users (email, name)
VALUES ('john@example.com', 'John Updated')
ON DUPLICATE KEY UPDATE 
    name = VALUES(name),
    updated_at = NOW();

-- Insert with multiple values
INSERT INTO users (email, name, phone)
VALUES 
    ('john@example.com', 'John', '555-1234'),
    ('jane@example.com', 'Jane', '555-5678'),
    ('bob@example.com', 'Bob', '555-9876')
ON DUPLICATE KEY UPDATE 
    name = VALUES(name),
    phone = VALUES(phone);

Advanced ON DUPLICATE KEY UPDATE

-- Increment counter on duplicate
INSERT INTO page_views (page_url, view_count)
VALUES ('/home', 1)
ON DUPLICATE KEY UPDATE 
    view_count = view_count + 1;

-- Conditional update
INSERT INTO users (email, name, status)
VALUES ('john@example.com', 'John', 'active')
ON DUPLICATE KEY UPDATE 
    status = CASE 
        WHEN VALUES(status) != status 
        THEN VALUES(status) 
        ELSE status 
    END,
    updated_at = NOW();

Practical Examples

Example 1: Clean Contact List

-- Find duplicate emails
SELECT 
    email,
    COUNT(*) AS dup_count,
    GROUP_CONCAT(id) AS duplicate_ids
FROM contacts
GROUP BY email
HAVING COUNT(*) > 1;

-- Remove duplicates, keep most recent
DELETE c1
FROM contacts c1
JOIN contacts c2 ON c1.email = c2.email 
    AND c1.created_at < c2.created_at;

Example 2: Merge Duplicate Records

-- Merge duplicate users
CREATE TEMPORARY TABLE temp_merged_users AS
SELECT 
    MIN(id) AS keep_id,
    GROUP_CONCAT(id) AS remove_ids,
    email,
    MAX(name) AS name,
    MAX(phone) AS phone
FROM users
GROUP BY email
HAVING COUNT(*) > 1;

-- Create table for merged records
CREATE TEMPORARY TABLE temp_merged_final AS
SELECT * FROM temp_merged_users;

-- Insert merged records
INSERT INTO users (id, email, name, phone)
SELECT * FROM temp_merged_final;

-- Delete old duplicates
DELETE FROM users 
WHERE id IN (SELECT remove_ids FROM temp_merged_users);

Example 3: Import Data

-- Import with duplicate handling
CREATE TEMPORARY TABLE temp_import AS
SELECT * FROM imported_data;

-- Import unique records
INSERT IGNORE INTO users (email, name)
SELECT email, name FROM temp_import;

-- Or update existing records
INSERT INTO users (email, name, phone)
SELECT email, name, phone FROM temp_import
ON DUPLICATE KEY UPDATE 
    name = VALUES(name),
    phone = VALUES(phone);

Example 4: Daily Deduplication

-- Create deduplicated view
CREATE OR REPLACE VIEW v_unique_users AS
SELECT 
    MIN(id) AS id,
    email,
    MAX(name) AS name
FROM users
GROUP BY email;

-- Use view instead of table
SELECT * FROM v_unique_users;

Duplicate Prevention Strategies

Database Design

-- Add unique constraints
ALTER TABLE users ADD UNIQUE INDEX idx_users_email (email);
ALTER TABLE users ADD UNIQUE INDEX idx_users_phone (phone);

-- Add composite unique for business logic
ALTER TABLE orders 
ADD UNIQUE INDEX idx_orders_user_product (user_id, product_id, order_date);

Application Logic

-- Check before insert
-- Step 1: Check exists
SELECT COUNT(*) INTO @exists 
FROM users 
WHERE email = 'john@example.com';

-- Step 2: Insert if not exists
IF @exists = 0 THEN
    INSERT INTO users (email, name) 
    VALUES ('john@example.com', 'John');
END IF;

Triggers for Prevention

-- Prevent duplicate inserts
DELIMITER //
CREATE TRIGGER before_user_insert
BEFORE INSERT ON users
FOR EACH ROW
BEGIN
    DECLARE dup_count INT;
    
    SELECT COUNT(*) INTO dup_count
    FROM users
    WHERE email = NEW.email;
    
    IF dup_count > 0 THEN
        SIGNAL SQLSTATE '45000'
        SET MESSAGE_TEXT = 'Duplicate email not allowed';
    END IF;
END //
DELIMITER ;

Monitoring Duplicates

Duplicate Detection Query

-- Find duplicates across all tables
SELECT 
    TABLE_NAME,
    COLUMN_NAME,
    COUNT(*) AS duplicate_count
FROM information_schema.COLUMNS c
JOIN (
    SELECT email, COUNT(*) AS dup_cnt
    FROM users
    GROUP BY email
    HAVING COUNT(*) > 1
) d ON d.email = c.COLUMN_NAME;

Regular Duplicate Check

-- Create procedure to check duplicates
DELIMITER //
CREATE PROCEDURE check_duplicates()
BEGIN
    SELECT 
        'users' AS table_name,
        email AS duplicate_field,
        COUNT(*) AS count
    FROM users
    GROUP BY email
    HAVING COUNT(*) > 1;
    
    SELECT 
        'orders' AS table_name,
        order_number AS duplicate_field,
        COUNT(*) AS count
    FROM orders
    GROUP BY order_number
    HAVING COUNT(*) > 1;
END //
DELIMITER ;

-- Run regularly
CALL check_duplicates();

Best Practices

Choosing Deduplication Method

-- Use INSERT IGNORE for:
-- - Simple insert operations
-- - Want to skip duplicates silently

-- Use REPLACE for:
-- - Complete row replacement
-- - Need to delete old record completely

-- Use ON DUPLICATE KEY UPDATE for:
-- - Update existing records
-- - Merge data
-- - Keep history with update timestamp

Performance Considerations

-- Large deduplication operations
-- Use temporary tables for better performance
CREATE TEMPORARY TABLE temp_dedup AS
SELECT * FROM large_table;

-- Add indexes to temporary table
CREATE INDEX idx_temp_dedup_email ON temp_dedup(email);

-- Delete in batches
DELETE FROM temp_dedup 
WHERE id IN (
    SELECT MIN(id) 
    FROM temp_dedup 
    GROUP BY email 
    HAVING COUNT(*) > 1 
    LIMIT 10000
);

Summary

Handling duplicates in MySQL involves:

  • Prevention: PRIMARY KEY, UNIQUE constraints
  • Detection: GROUP BY, self-joins, subqueries
  • Removal: DELETE with joins, temporary tables
  • Insert Handling: INSERT IGNORE, REPLACE, ON DUPLICATE KEY UPDATE
  • Monitoring: Regular checks for data quality

Choose appropriate method based on your data integrity requirements and performance needs.


Previous: Sequences

Next: SQL Injection