SQL Server 2008 R2: Delete duplicate rows from tab

2019-02-21 04:02发布

问题:

--Creating Table dup1

CREATE TABLE dup1
(
    cola VARCHAR(10),
    colb VARCHAR(10)
);

--Insertion of records

INSERT INTO dup1 VALUES('1','2');
INSERT INTO dup1 VALUES('1','2');
INSERT INTO dup1 VALUES('1','3');
INSERT INTO dup1 VALUES('1','4');
INSERT INTO dup1 VALUES('1','5');

--Creating Table dup2

CREATE TABLE dup2
(
    cola VARCHAR(10),
    colb VARCHAR(10)
);

--Insertion of records

INSERT INTO dup2 VALUES('1','2');
INSERT INTO dup2 VALUES('1','2');
INSERT INTO dup2 VALUES('2','3');
INSERT INTO dup2 VALUES('2','4');
INSERT INTO dup2 VALUES('1','5');

--Creating View

CREATE VIEW V_Dup as
SELECT * FROM dup1 UNION ALL 
SELECT * FROM dup2;

Note: The records 1,2 and 1,5 is duplicate in both tables BUT want to keep the original data.

My question is how do I delete the duplicate records from both tables.

回答1:

create temp. table :

DECLARE @tempDuplicateTable AS TABLE(
    cola VARCHAR(10),
    colb VARCHAR(10)
)

insert row duplicate :

INSERT INTO @tempDuplicateTable
    ( cola, colb )
    (
        SELECT a.cola, a.colb FROM dup1 a
        INNER JOIN dup2 b ON b.cola = a.cola AND b.colb = a.colb
    )

delete duplicate data from both table dup1 and dup2 :

DELETE a FROM dup1 a INNER JOIN @tempDuplicateTable b ON b.cola = a.cola AND b.colb = a.colb
DELETE a FROM dup2 a INNER JOIN @tempDuplicateTable b ON b.cola = a.cola AND b.colb = a.colb

if you just want this result :

 cola   colb
   1    2
   1    3
   1    4
   1    5
   2    3
   2    4

try this query :

SELECT DISTINCT * FROM V_Dup 

or you can modify your View like this :

CREATE VIEW V_Dup as
    SELECT DISTINCT a.* FROM (
    SELECT * FROM dup1 UNION ALL 
    SELECT * FROM dup2
) a


回答2:

Create a temp table with the rows need to be deleted from CTE and delete those from both the tables.

Query

CREATE TABLE dup1
(
    cola VARCHAR(10),
    colb VARCHAR(10)
);

INSERT INTO dup1 VALUES('1','2');
INSERT INTO dup1 VALUES('1','2');
INSERT INTO dup1 VALUES('1','3');
INSERT INTO dup1 VALUES('1','4');
INSERT INTO dup1 VALUES('1','5');

CREATE TABLE dup2
(
    cola VARCHAR(10),
    colb VARCHAR(10)
);

INSERT INTO dup2 VALUES('1','2');
INSERT INTO dup2 VALUES('1','2');
INSERT INTO dup2 VALUES('2','3');
INSERT INTO dup2 VALUES('2','4');
INSERT INTO dup2 VALUES('1','5');

CREATE VIEW V_Dup as
SELECT * FROM dup1 UNION ALL
SELECT * FROM dup2;

;with cte as 
(
    select rn=row_number() over
    (
        partition by cola,colb
        order by cola,colb
    ),*
    from V_Dup
)
select * into #temp
from cte 
where rn>1;

delete t1 from dup1 t1
inner join  #temp t2
on t1.cola = t2.cola
and t1.colb = t2.colb;

delete t1 from dup2 t1
inner join  #temp t2
on t1.cola = t2.cola
and t1.colb = t2.colb;

drop table #temp;


回答3:

I guess you want just to see record once in your view because you say you want to keep original data. So you should use UNION instead of UNION ALL in your view

CREATE VIEW V_Dup as
SELECT * FROM dup1 UNION  
SELECT * FROM dup2;

Otherwise if you want to delete all the duplicated rows from table generating view you will have to do something like this:

;WITH DUP_CTE AS
(
SELECT  cola, colb,ROW_NUMBER() OVER (PARTITION BY cola,colb ORDER BY (SELECT 0)) RN FROM V_Dup 
)
DELETE FROM DUP_CTE

WHERE EXISTS( SELECT 0 FROM dup_cte AS c WHERE c.cola=dup_cte.cola AND c.colb=dup_cte.colb AND RN <> 1);

If you want to delete only duplicates:

;WITH DUP_CTE AS
    (
    SELECT  cola, colb,ROW_NUMBER() OVER (PARTITION BY cola,colb ORDER BY (SELECT 0)) RN FROM V_Dup 
    )
    DELETE FROM DUP_CTE
    WHERE  RN <> 1;

But you will need a primary key in your tables for the last two solutionz.



回答4:

Are dup1 and dup2 partitionable table? I mean if you can add a column to them that allows to understand wich table to update. e.g. date so that if you insert a date in an interval you update dup1 else dup2. If you don't have a partitioning column you could create one simply adding a column able to identify the table (e.g. a varchar(1) with value '1' or an int having value 1 for dup1 and 2 for dup2) . This column should be a part of the primary key (here I created an ID). The table could look like this:

CREATE TABLE dbo.dup1
    (
    cola   VARCHAR (10),
    colb   VARCHAR (10),
    ID     INT IDENTITY NOT NULL,
    partit NCHAR (10) CONSTRAINT DF_dup1_partit DEFAULT ('1') NOT NULL CONSTRAINT CK_dup1 CHECK ([PARTIT]='1'),
    CONSTRAINT PK_dup1 PRIMARY KEY (ID, partit)
    )

CREATE TABLE dbo.dup2
    (
    cola   VARCHAR (10),
    colb   VARCHAR (10),
    ID     INT IDENTITY NOT NULL,
    partit NCHAR (10) CONSTRAINT DF_dup2_partit DEFAULT ('2') NOT NULL CONSTRAINT CK_dup2 CHECK ([PARTIT]='2'),
    CONSTRAINT PK_dup2 PRIMARY KEY (ID, partit)
    )

So that the view will be:

CREATE VIEW V_Dup as
SELECT * FROM dup1 UNION ALL 
SELECT * FROM dup2
WITH CHECK OPTION

this way you will be able to use the code I posted before