如何合并具有不同有效日期的类似记录?


10

我正在处理的表包含三个部分:

  1. 一个ID柱(在另一个表中的主键)
  2. 一些数据列
  3. 日期有效from/ to列。

值:

ID   Data From        To  
1    a    2015-01-01  2015-01-05
1    a    2015-01-06  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-05
2    c    2015-01-06  2015-01-10

通过以一定间隔获取另一个数据源的“快照”并将有效日期分配给记录来更新表。问题在于这些快照为记录(具有不同的有效日期)创建了重复的条目,这些记录在该时间间隔内根本没有更改。

我想通过查找具有连续日期的行并合并它们并为它们分配一个有效期来减小表的大小。例如:

ID   Data From        To  
1    a    2015-01-01  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-10

我目前拥有的逻辑是:

  1. 选择所有行并按ID,数据字段和“有效自”字段排序(因此它们在连续行的组中)。
  2. 使用光标比较相邻行的相似性。
  3. 如果它们相同,则合并行并更改有效期以包括两行。

我知道游标的效率非常低(我有一个很大的数据集),所以我正在寻找其他方法。


2
另外:CREATE TABLE在问题中添加语句。
ypercubeᵀᴹ

2
“大型”数据集有多大?为什么不修复快照导入,这样一来就不会造成问题?
保罗·怀特9

数以百万计的记录。我没有修改表创建方式的权限。另外,这不能解决过去的记录问题。
hazrmard

Answers:


8

如果这仅是一个背对背范围的表,则您的情况可以视为经典的“间隙和孤岛”问题,您只需要隔离连续范围的孤岛,然后通过取最小范围来“压缩”它们即可。 [from][to]每个岛的最大值。

有一个使用两个ROW_NUMBER调用来解决此问题的既定方法:

WITH islands AS
(
  SELECT
    id,
    data,
    [from],
    [to],
    island = ROW_NUMBER() OVER (PARTITION BY id       ORDER BY [from])
           - ROW_NUMBER() OVER (PARTITION BY id, data ORDER BY [from])
  FROM
    #mergeTest
)
SELECT
  id,
  data,
  [from] = MIN([from]),
  [to]   = MAX([to])
FROM
  islands
GROUP BY
  id,
  data,
  island
;

此查询将在与SQL Server 2005一样低的版本中工作。


1

我能够编写一个查询来解决此问题。它使用多个联接和一个while循环来合并记录。此代码与SQL Server 2008 R2兼容。

CREATE TABLE #mergeTest
(
    [id] int NOT NULL,
    [data] date,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO #mergeTest ([id],[data],[from],[to]) VALUES     --testing null data value handling
    (1,NULL,'2015-01-01','2015-01-05'), --1
    (1,NULL,'2015-01-05','2015-01-10'), --2
    (1,'2000-01-01','2015-01-10','2015-01-14'), --3
    (1,'2000-01-03','2015-01-14','2015-01-15'), --4
    (1,'2000-01-01','2015-01-15','2015-01-20'), --5
    (1,'2000-01-01','2015-01-20','2015-01-22'), --5
    (1,'2000-01-01','2015-01-22','2015-01-25'), --6
    (1,'2000-01-01','2015-01-25','2015-01-30'), --7
    (1,NULL,'2015-01-30','2015-02-04'), --8
    (2,'2000-01-05','2015-01-01','2015-01-05'), --9
    (2,'2000-01-05','2015-01-05','2015-01-10')  --10

SELECT * FROM #mergeTest 
GO
;

SELECT * INTO #tempSingle                               --isolate single records. Single records need no processing.
    FROM (
        SELECT  [id], [data], MIN([from]) as [from], MIN([to]) as [to],
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]=1;
ALTER TABLE #tempSingle
    DROP COLUMN [grpsz];
GO
;

SELECT * INTO #tempRemainingtemp                        --isolate records w/ more than 2 entries. They need to be reduced to single records
    FROM (
        SELECT  [id], [data],                           --get [id] and [data] of duplicate records
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]>=2;
ALTER TABLE #tempRemainingTemp
    DROP COLUMN [grpsz]
SELECT * FROM #tempRemainingtemp
SELECT * INTO #temp                                     --get all duplicate records into #temp
    FROM (
        SELECT [b].*
        FROM #tempRemainingtemp AS [a]
        JOIN #mergeTest AS [b]
        ON      [a].[id]=[b].[id]
            AND ([a].[data]=[b].[data] OR [a].[data] IS NULL AND [b].[data] IS NULL)) AS [selection];

DROP TABLE #tempRemainingtemp;
Go
SELECT * INTO #tempRemaining
    FROM #temp;
DROP TABLE #temp;
GO
;
SELECT * FROM #tempRemaining
BEGIN
SELECT t1.*, t2.[from] as [prevfrom] INTO #temp0        --filter in records where previous 'to' date matched current 'from' date when grouped by id and data
    FROM #tempRemaining AS t1
    JOIN #tempRemaining AS t2
    ON      t2.[to] = t1.[from]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)

SELECT t1.*, t2.[prevfrom] INTO #temp1                  --add records that did not have a previous 'to' date b/c they were the extreme records in their group
    FROM #tempRemaining AS t1
    LEFT JOIN #temp0 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp0;

SELECT t1.*, t2.[to] as [nextto] INTO #temp2            --filter in records where current 'to' date matched next 'from' date when grouped by id and data
    FROM #temp1 AS t1
    JOIN #temp1 AS t2
    ON      t2.[from] = t1.[to]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL);

SELECT t1.*, t2.[nextto] INTO #temp                     --add records that did not have a next 'from' date b/c they were the extreme records in their group
    FROM #temp1 AS t1
    LEFT JOIN #temp2 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp2;
DROP TABLE #temp1;

DELETE FROM #temp                                       --delete redundant records
    WHERE   [prevfrom] IS NOT NULL
        AND [nextto] IS NOT NULL;

WITH cte AS (                                           --select records that got reduced to singles and insert them into singles account
    SELECT [id], [data], [from], [to]
        FROM [#temp]
        WHERE   [prevfrom] IS NULL
            AND [nextto] IS NULL)
DELETE FROM cte
OUTPUT deleted.* INTO #tempSingle

/* ALL DUPLICATE RECORDS ARE NOW REDUCED TO PAIRS*/

SELECT * FROM #temp;
ALTER TABLE #temp
    DROP COLUMN [nextto],[prevfrom]                     --remove helper columns
END

SELECT TOP 1 * INTO #temptemp                           --create temporary tables for storage
    FROM #temp
SELECT TOP 1 * INTO #tempResult
    FROM #temp
TRUNCATE TABLE #temptemp
TRUNCATE TABLE #tempResult

WHILE EXISTS(SELECT [id] from #temp)
BEGIN
    WITH cte AS (
            SELECT TOP 2 *                              --select pair
                FROM #temp
                ORDER BY [id],[data],[from])
        DELETE FROM cte                                 --delete from original table
        OUTPUT deleted.* INTO #temptemp;
    INSERT INTO #tempResult                             --insert merged record into result table
        SELECT t1.[id], t1.[data], t1.[from], t2.[to]
        FROM #temptemp AS t1
        JOIN #temptemp AS t2
        ON t1.[from]<t2.[from];
    TRUNCATE TABLE #temptemp;                           --empty temporary storage table
END;

TRUNCATE TABLE #mergeTest;                              --insert single records and merged records into original table
INSERT INTO #mergeTest
    SELECT * FROM #tempResult;
INSERT INTO #mergeTest
    SELECT * FROM #tempSingle;

SELECT * FROM #mergeTest
    ORDER BY [id],[from];

0

就您有不连续的日期范围(即使连续的日期范围也必须分开)而言,我想出了以下解决方案:

在SQL小提琴上看到

WITH lag_info AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    lag([To], 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevTo,
    lag(Data, 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevData
  FROM dat
),
segmented AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    -- new interval if non-contigous or data changed
    -- if it's null, it means that it's the first entry for the ID, which means it's a new interval
    CASE
      WHEN [PrevTo] IS NULL
        OR PrevData IS NULL
        OR DATEDIFF(DAY, [PrevTo], [From]) > 1
        OR Data <> PrevData
      THEN 1
      ELSE 0
    END AS is_new_interval
  FROM lag_info
),
segmented_marked AS (
  SELECT
    ID,
    [From],
    [To],
    Data,
    -- increment only when new data is detected, using a running sum
    sum(s.is_new_interval)
      OVER (PARTITION BY ID ORDER BY [From] ROWS BETWEEN UNBOUNDED PRECEDING AND 0 FOLLOWING)
                                AS interval_id
  FROM segmented s
)
SELECT
  ID,
  min([From]) AS [From],
  max([To]) AS [To],
  Data
FROM segmented_marked
GROUP BY ID, Data, interval_id

-1

我写了一个似乎有效的查询。它使用通用表表达式,MERGE语句和解析函数。但是,它仅与SQL Server 2012+兼容。您可以在这里找到要点:MergeRecordsByValidityDate.sql

/*  NOTE: Only works w/ SQL Server 2012+
    Merging identical records with different validity dates.
*/
USE [master]


IF OBJECT_ID('mergeTest') IS NOT NULL
    DROP TABLE mergeTest

CREATE TABLE mergeTest          -- Create table with test data
(
    [id] int NOT NULL,
    [data] char(1) NOT NULL,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO mergeTest ([id],[data],[from],[to]) VALUES      -- Insert records w/ different validity dates
    (1,'a','2015-01-01','2015-01-05'),  --1
    (1,'a','2015-01-05','2015-01-10'),  --2
    (1,'a','2015-01-10','2015-01-14'),  --3
    (1,'b','2015-01-14','2015-01-15'),  --4
    (1,'a','2015-01-15','2015-01-20'),  --5
    (1,'a','2015-01-20','2015-01-25'),  --6
    (1,'a','2015-01-25','2015-01-30'),  --7
    (1,'a','2015-01-30','2015-02-04'),  --8
    (2,'c','2015-01-01','2015-01-05'),  --9
    (2,'c','2015-01-05','2015-01-10')   --10

SELECT * FROM mergeTest

/*  This SELECT function uses a Common Table Expression along with Analytic functions over a partition.
    The data set is partitioned on similar primary key and data columns and ordered by 'from' dates.
    A 'last' and 'next' column is added with 'to' date of prev row and 'from' date of next row.
    For each partition, rows are selected (for each partition) that represent the first and last records 
    of identical data. For e.g. rows 5,6,7,8 are reduced to 5,8.
*/

;WITH partitionedData AS (
    SELECT *,   LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM mergeTest)
SELECT [id],[data],[from],[to],[last],[next] INTO #temp
    FROM partitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temp

/*  Now all redundant 'sandwiched' records have been filtered out, only the extreme records are left.
    This MERGE function matches rows on primary key and data, and If the 'to' date of said record matches
    'from' date of another similar record, then the said record is extended to encapsulate the other record's
    'to' date. For example row 5's 'to' date is extended to equal row 8's 'to' date.
*/

MERGE INTO #temp as m1
    USING #temp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[to]=m2.[from])
    THEN
    UPDATE SET  m1.[to]=m2.[to]
;

SELECT * FROM #temp

/*  The MERGE function has done its job of extending records. However there are still 2 records with
    identical data. For e.g. rows 9,10 exist even though row 9 now has all the required information. This 
    block modifies such redundant rows so their 'last' and 'from' columns become asynchronous.
*/

;WITH repartitionedData AS (
    SELECT [id],[data],[from],[to], LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM #temp)
SELECT [id],[data],[from],[to],[last],[next] INTO #temptemp
    FROM repartitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temptemp

/* Asynchronous rows are deleted
*/

DELETE FROM #temptemp
    WHERE [from]<[last]

SELECT * FROM #temptemp

/*  However, blocks of data with >2 rows (like rows 5 through 8) could not be merged because of the filtered out
    rows (i.e. rows 6,7). Applying MERGE again on the updated data set.
*/

MERGE INTO #temptemp as m1
    USING #temptemp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[from]=m2.[next])
    THEN
    UPDATE SET  m1.[from]=m2.[from],
                m1.[last]=CASE WHEN ((m2.[last] IS NULL) OR (m2.[next] IS NULL)) THEN NULL ELSE m1.[last] END   --if row absorbing from is extreme, then current row is also extreme
;

SELECT * FROM #temptemp

TRUNCATE TABLE mergeTest        -- resetting original table

/* The MERGE corrected all rows with the correct 'from' and 'to' dates. And the only rows we are interested in are
    the extreme rows i.e. with 'last' or 'next' == NULL. SELECTing on that criterion and INSERTing into original table.
*/

INSERT INTO mergeTest           -- inserting processed records into table + some last minute filtering
    SELECT [id],[data],[from],MAX([to])
    FROM #temptemp
        WHERE [next] IS NULL OR [last] IS NULL
    GROUP BY [id],[data],[from]

SELECT * FROM mergeTest

DROP TABLE #temp
DROP TABLE #temptemp
By using our site, you acknowledge that you have read and understand our Cookie Policy and Privacy Policy.
Licensed under cc by-sa 3.0 with attribution required.