Friday, March 9, 2012

Problem Deleting Duplicate Data

I have a table that contains more than 10,000 rows of

duplicate data. The script below copies the data to a temp table then

deletes from the original table. My problem is that after it runs, I now

have 122 rows of triplicate data (but dups are gone). If I rerun the script, it doesn't see the

triplicate data and returns 0 rows. I've use three different versions of

delete dup row scripts with the same result. There are no triggers or

constraints on the table, not even a primary key. What am I missing?

-

/**********************************************
Delete Duplicate Data
**********************************************/

--Create temp table to hold duplicate data
CREATE TABLE #tempduplicatedata
(
[student_test_uniq] [bigint] NULL,
[test_uniq] [int] NULL,
[concept_id] [smallint] NULL,
[test_id] [varchar](12) NULL,
[questions_correct] [smallint] NULL,
[questions_count] [smallint] NULL,
[percentage_correct] [decimal](6, 3) NULL,
[concept_response_count] [smallint] NULL
)

--Identify and save dup data into temp table
INSERT INTO #tempduplicatedata
SELECT * FROM crt_concept_score
GROUP BY student_test_uniq,
test_uniq,
concept_id,
test_id,
questions_correct,
questions_count,
percentage_correct,
concept_response_count
HAVING COUNT(*) > 1

--Confirm number of dup rows
SELECT @.@.ROWCOUNT AS 'Number of Duplicate Rows'

--Delete dup from original table
DELETE FROM crt_concept_score
FROM crt_concept_score
INNER JOIN #tempduplicatedata
ON crt_concept_score.student_test_uniq = #tempduplicatedata.student_test_uniq
AND crt_concept_score.test_uniq = #tempduplicatedata.test_uniq
AND crt_concept_score.concept_id = #tempduplicatedata.concept_id
AND crt_concept_score.test_id = #tempduplicatedata.test_id
AND crt_concept_score.questions_correct = #tempduplicatedata.questions_correct
AND crt_concept_score.questions_count = #tempduplicatedata.questions_count
AND crt_concept_score.percentage_correct = #tempduplicatedata.percentage_correct
AND crt_concept_score.concept_response_count = #tempduplicatedata.concept_response_count

--Insert the delete data back
INSERT INTO crt_concept_score
SELECT * FROM #tempduplicatedata

--Check for dup data.
SELECT * FROM crt_concept_score
GROUP BY student_test_uniq,
test_uniq,
concept_id,
test_id,
questions_correct,
questions_count,
percentage_correct,
concept_response_count
HAVING COUNT(*) > 1

--Check table
-- SELECT * FROM crt_concept_score

--Drop temp table
DROP TABLE #tempduplicatedata
GO

i've seen something like this in one of Kat's post

here's the link

http://forums.microsoft.com/MSDN/ShowPost.aspx?PostID=746636&SiteID=1

No comments:

Post a Comment