MySQL Handle Duplicates
Overview
Duplicate data can cause data integrity issues and performance problems. MySQL provides several methods to find, prevent, and handle duplicate records in tables.
Duplicate Types
- Exact Duplicates: All columns identical
- Partial Duplicates: Some columns identical
- Primary Key Duplicates: Duplicate primary keys
- Unique Key Duplicates: Duplicate unique constraints
- Business Duplicates: Logical duplicates (same person multiple times)
Preventing Duplicates
PRIMARY KEY Constraint
sql
-- Prevent exact row duplicates
CREATE TABLE users (
id INT PRIMARY KEY AUTO_INCREMENT,
email VARCHAR(255) UNIQUE,
name VARCHAR(100)
);
-- Insert attempt with duplicate primary key fails
INSERT INTO users (id, name, email)
VALUES (1, 'John', 'john@example.com');
-- Error: Duplicate entry for key 'PRIMARY'UNIQUE Constraint
sql
-- Prevent duplicate values in specific columns
CREATE TABLE emails (
id INT PRIMARY KEY AUTO_INCREMENT,
email VARCHAR(255) UNIQUE,
user_id INT
);
-- Insert duplicate email fails
INSERT INTO emails (email, user_id)
VALUES ('john@example.com', 1);
-- Error: Duplicate entry for key 'email'Composite Unique Constraint
sql
-- Prevent combination duplicates
CREATE TABLE user_emails (
id INT PRIMARY KEY AUTO_INCREMENT,
user_id INT,
email_type VARCHAR(20),
email VARCHAR(255),
UNIQUE (user_id, email_type)
);
-- Insert duplicate user_id + email_type fails
INSERT INTO user_emails (user_id, email_type, email)
VALUES (1, 'work', 'john@example.com'),
(1, 'work', 'jane@example.com');
-- Error: Duplicate entry for key 'user_id'Finding Duplicates
Group By Method
sql
-- Find duplicates by specific column
SELECT
email,
COUNT(*) AS duplicate_count
FROM users
GROUP BY email
HAVING COUNT(*) > 1;
-- Find all columns that are duplicates
SELECT
email,
name,
COUNT(*) AS duplicate_count
FROM users
GROUP BY email, name
HAVING COUNT(*) > 1;Subquery Method
sql
-- Find duplicate records
SELECT u.*
FROM users u
WHERE email IN (
SELECT email
FROM users
GROUP BY email
HAVING COUNT(*) > 1
);
-- Show one instance per duplicate
SELECT u.*
FROM users u
WHERE u.id IN (
SELECT MIN(id)
FROM users
WHERE email = u.email
);Self JOIN Method
sql
-- Find duplicate records using self-join
SELECT u1.id, u1.email, u1.name
FROM users u1
JOIN users u2 ON u1.email = u2.email AND u1.id > u2.id;
-- Count duplicates per value
SELECT
u1.email,
COUNT(*) AS duplicate_count
FROM users u1
JOIN users u2 ON u1.email = u2.email AND u1.id > u2.id
GROUP BY u1.email;Removing Duplicates
Delete Using Self JOIN
sql
-- Keep the lowest ID for each duplicate
DELETE u1
FROM users u1
JOIN users u2 ON u1.email = u2.email AND u1.id > u2.id;
-- Check deleted count
SELECT ROW_COUNT();Delete Using Subquery
sql
-- Delete duplicates, keep first occurrence
DELETE FROM users
WHERE id NOT IN (
SELECT MIN(id)
FROM users
GROUP BY email
);
-- Delete duplicates, keep highest ID
DELETE FROM users
WHERE id NOT IN (
SELECT MAX(id)
FROM users
GROUP BY email
);Delete Using Temporary Table
sql
-- Create temporary table with unique records
CREATE TEMPORARY TABLE temp_unique_users AS
SELECT
MIN(id) AS id,
email,
name
FROM users
GROUP BY email, name;
-- Truncate original table
TRUNCATE TABLE users;
-- Insert unique records
INSERT INTO users (id, email, name)
SELECT * FROM temp_unique_users;INSERT IGNORE / INSERT IGNORE
Skip Duplicate Errors
sql
-- Ignore duplicate errors
INSERT IGNORE INTO users (email, name)
VALUES ('john@example.com', 'John'),
('jane@example.com', 'Jane'),
('john@example.com', 'John Duplicate'); -- Skipped
-- Check affected rows
SELECT ROW_COUNT(); -- Shows 2 (1 skipped)REPLACE INTO / REPLACE INTO
Replace Existing Records
sql
-- Replace if duplicate key exists
REPLACE INTO users (id, email, name)
VALUES (1, 'john@example.com', 'John Updated');
-- Equivalent to DELETE + INSERT
-- If id=1 exists, delete it, then insert new recordON DUPLICATE KEY UPDATE / ON DUPLICATE KEY UPDATE
Update on Duplicate
sql
-- Update if duplicate key exists
INSERT INTO users (email, name)
VALUES ('john@example.com', 'John Updated')
ON DUPLICATE KEY UPDATE
name = VALUES(name),
updated_at = NOW();
-- Insert with multiple values
INSERT INTO users (email, name, phone)
VALUES
('john@example.com', 'John', '555-1234'),
('jane@example.com', 'Jane', '555-5678'),
('bob@example.com', 'Bob', '555-9876')
ON DUPLICATE KEY UPDATE
name = VALUES(name),
phone = VALUES(phone);Advanced ON DUPLICATE KEY UPDATE
sql
-- Increment counter on duplicate
INSERT INTO page_views (page_url, view_count)
VALUES ('/home', 1)
ON DUPLICATE KEY UPDATE
view_count = view_count + 1;
-- Conditional update
INSERT INTO users (email, name, status)
VALUES ('john@example.com', 'John', 'active')
ON DUPLICATE KEY UPDATE
status = CASE
WHEN VALUES(status) != status
THEN VALUES(status)
ELSE status
END,
updated_at = NOW();Practical Examples
Example 1: Clean Contact List
sql
-- Find duplicate emails
SELECT
email,
COUNT(*) AS dup_count,
GROUP_CONCAT(id) AS duplicate_ids
FROM contacts
GROUP BY email
HAVING COUNT(*) > 1;
-- Remove duplicates, keep most recent
DELETE c1
FROM contacts c1
JOIN contacts c2 ON c1.email = c2.email
AND c1.created_at < c2.created_at;Example 2: Merge Duplicate Records
sql
-- Merge duplicate users
CREATE TEMPORARY TABLE temp_merged_users AS
SELECT
MIN(id) AS keep_id,
GROUP_CONCAT(id) AS remove_ids,
email,
MAX(name) AS name,
MAX(phone) AS phone
FROM users
GROUP BY email
HAVING COUNT(*) > 1;
-- Create table for merged records
CREATE TEMPORARY TABLE temp_merged_final AS
SELECT * FROM temp_merged_users;
-- Insert merged records
INSERT INTO users (id, email, name, phone)
SELECT * FROM temp_merged_final;
-- Delete old duplicates
DELETE FROM users
WHERE id IN (SELECT remove_ids FROM temp_merged_users);Example 3: Import Data
sql
-- Import with duplicate handling
CREATE TEMPORARY TABLE temp_import AS
SELECT * FROM imported_data;
-- Import unique records
INSERT IGNORE INTO users (email, name)
SELECT email, name FROM temp_import;
-- Or update existing records
INSERT INTO users (email, name, phone)
SELECT email, name, phone FROM temp_import
ON DUPLICATE KEY UPDATE
name = VALUES(name),
phone = VALUES(phone);Example 4: Daily Deduplication
sql
-- Create deduplicated view
CREATE OR REPLACE VIEW v_unique_users AS
SELECT
MIN(id) AS id,
email,
MAX(name) AS name
FROM users
GROUP BY email;
-- Use view instead of table
SELECT * FROM v_unique_users;Duplicate Prevention Strategies
Database Design
sql
-- Add unique constraints
ALTER TABLE users ADD UNIQUE INDEX idx_users_email (email);
ALTER TABLE users ADD UNIQUE INDEX idx_users_phone (phone);
-- Add composite unique for business logic
ALTER TABLE orders
ADD UNIQUE INDEX idx_orders_user_product (user_id, product_id, order_date);Application Logic
sql
-- Check before insert
-- Step 1: Check exists
SELECT COUNT(*) INTO @exists
FROM users
WHERE email = 'john@example.com';
-- Step 2: Insert if not exists
IF @exists = 0 THEN
INSERT INTO users (email, name)
VALUES ('john@example.com', 'John');
END IF;Triggers for Prevention
sql
-- Prevent duplicate inserts
DELIMITER //
CREATE TRIGGER before_user_insert
BEFORE INSERT ON users
FOR EACH ROW
BEGIN
DECLARE dup_count INT;
SELECT COUNT(*) INTO dup_count
FROM users
WHERE email = NEW.email;
IF dup_count > 0 THEN
SIGNAL SQLSTATE '45000'
SET MESSAGE_TEXT = 'Duplicate email not allowed';
END IF;
END //
DELIMITER ;Monitoring Duplicates
Duplicate Detection Query
sql
-- Find duplicates across all tables
SELECT
TABLE_NAME,
COLUMN_NAME,
COUNT(*) AS duplicate_count
FROM information_schema.COLUMNS c
JOIN (
SELECT email, COUNT(*) AS dup_cnt
FROM users
GROUP BY email
HAVING COUNT(*) > 1
) d ON d.email = c.COLUMN_NAME;Regular Duplicate Check
sql
-- Create procedure to check duplicates
DELIMITER //
CREATE PROCEDURE check_duplicates()
BEGIN
SELECT
'users' AS table_name,
email AS duplicate_field,
COUNT(*) AS count
FROM users
GROUP BY email
HAVING COUNT(*) > 1;
SELECT
'orders' AS table_name,
order_number AS duplicate_field,
COUNT(*) AS count
FROM orders
GROUP BY order_number
HAVING COUNT(*) > 1;
END //
DELIMITER ;
-- Run regularly
CALL check_duplicates();Best Practices
Choosing Deduplication Method
sql
-- Use INSERT IGNORE for:
-- - Simple insert operations
-- - Want to skip duplicates silently
-- Use REPLACE for:
-- - Complete row replacement
-- - Need to delete old record completely
-- Use ON DUPLICATE KEY UPDATE for:
-- - Update existing records
-- - Merge data
-- - Keep history with update timestampPerformance Considerations
sql
-- Large deduplication operations
-- Use temporary tables for better performance
CREATE TEMPORARY TABLE temp_dedup AS
SELECT * FROM large_table;
-- Add indexes to temporary table
CREATE INDEX idx_temp_dedup_email ON temp_dedup(email);
-- Delete in batches
DELETE FROM temp_dedup
WHERE id IN (
SELECT MIN(id)
FROM temp_dedup
GROUP BY email
HAVING COUNT(*) > 1
LIMIT 10000
);Summary
Handling duplicates in MySQL involves:
- Prevention: PRIMARY KEY, UNIQUE constraints
- Detection: GROUP BY, self-joins, subqueries
- Removal: DELETE with joins, temporary tables
- Insert Handling: INSERT IGNORE, REPLACE, ON DUPLICATE KEY UPDATE
- Monitoring: Regular checks for data quality
Choose appropriate method based on your data integrity requirements and performance needs.
Previous: Sequences
Next: SQL Injection