Friday, August 17, 2012

Dedupe based on compound key and timestamp

DECLARE @sourcetab TABLE ( id1 int, id2 int, update_on smalldatetime, val varchar(50) )

DECLARE @targettab TABLE ( id1 int, id2 int, update_on smalldatetime, val varchar(50) )


insert into @sourcetab (id1, id2, update_on, val)
      SELECT 10, 1, '20120110', 'testrec1'
UNION SELECT 10, 1, '20120201', 'testrec2'
UNION SELECT 5, 2, '20120201', 'testrec3'
UNION SELECT 5, 1, '20120201', 'testrec4'
UNION SELECT 5, 1, '20120205', 'testrec5'
UNION SELECT 12, 18, '20120201', 'testrec6'
UNION SELECT 12, 18, '20120205', 'testre7'
UNION SELECT 12, 5, '20120201', 'testrec8'
UNION SELECT 17, 3, '20120201', 'testrec9'
UNION SELECT 18, 4, '20120201', 'testrec10'


insert into @targettab (id1, id2, update_on, val)
      SELECT 10, 1, '20120101', 'testrec01'
UNION SELECT 12, 5, '20120101', 'testrec02'
UNION SELECT 20, 19, '20120101', 'testrec03'


--not deduped
          SELECT A.id1, A.id2, A.update_on, A.val  
            FROM @sourcetab A
        ORDER BY A.id1, A.id2, A.update_on


--deduped with most current wins
          SELECT A.id1, A.id2, A.update_on, A.val 
            FROM @sourcetab A
           WHERE A.update_on = ( SELECT max(A1.update_on)
                                   FROM @sourcetab A1
                                  WHERE A1.id1 = A.id1
                                    AND A1.id2 = A.id2 )


--update with most recent
          UPDATE A
             SET A.val = B.val
               , A.update_on = B.update_on
            FROM @targettab A
      INNER JOIN @sourcetab B
              ON A.id1 = B.id1 
             AND A.id2 = B.id2
           WHERE B.update_on > A.update_on
             AND B.update_on = ( SELECT max(B1.update_on)
                                   FROM @sourcetab B1
                                  WHERE B1.id1 = B.id1
                                    AND B1.id2 = B.id2 )



--insert new recs

     INSERT INTO @targettab ( id1, id2, update_on, val )
          SELECT A.id1, A.id2, A.update_on, A.val 
            FROM @sourcetab A
           WHERE A.update_on = ( SELECT max(A1.update_on)
                                   FROM @sourcetab A1
                                  WHERE A1.id1 = A.id1
                                    AND A1.id2 = A.id2 )
             AND NOT EXISTS ( SELECT TOP 1 1 
                                FROM @targettab B
                               WHERE B.id1 = A.id1
                                 AND B.id2 = A.id2 )


--final result
          SELECT A.* 
            FROM @targettab A
        ORDER BY A.id1, A.id2, A.update_on



No comments:

Post a Comment