我研究过类似的问题,但从未找到过一次对数据进行一次传递的窗口函数解决方案。我认为这是不可能的。窗口函数必须能够应用于列中的所有值。这样会使复位计算非常困难,因为一次复位会更改以下所有值的值。
考虑问题的一种方法是,只要您可以从正确的上一行中减去运行总计,就可以计算出基本运行总计来获得所需的最终结果。例如,在样本数据中,id
4的值为running total of row 4 - the running total of row 3
。id
6 的值是,running total of row 6 - the running total of row 3
因为尚未发生重置。id
7 的值是running total of row 7 - the running total of row 6
,依此类推。
我会在一个循环中使用T-SQL来解决这个问题。我有些不高兴,认为我有完整的解决方案。对于300万行和500组,代码在我的桌面上在24秒内完成。我正在测试具有6个vCPU的SQL Server 2016 Developer版本。我通常利用并行插入和并行执行的优势,因此如果您使用的是旧版本或具有DOP限制,则可能需要更改代码。
在我用来生成数据的代码下面。在范围VAL
和RESET_VAL
应类似于您的样本数据。
drop table if exists reset_runn_total;
create table reset_runn_total
(
id int identity(1,1),
val int,
reset_val int,
grp int
);
DECLARE
@group_num INT,
@row_num INT;
BEGIN
SET NOCOUNT ON;
BEGIN TRANSACTION;
SET @group_num = 1;
WHILE @group_num <= 50000
BEGIN
SET @row_num = 1;
WHILE @row_num <= 60
BEGIN
INSERT INTO reset_runn_total WITH (TABLOCK)
SELECT 1 + ABS(CHECKSUM(NewId())) % 10, 8 + ABS(CHECKSUM(NewId())) % 8, @group_num;
SET @row_num = @row_num + 1;
END;
SET @group_num = @group_num + 1;
END;
COMMIT TRANSACTION;
END;
算法如下:
1)首先将所有具有标准运行总计的行插入到临时表中。
2)在循环中:
2a)对于每个组,在表中剩余的reset_value之上计算运行总计的第一行,并在临时表中存储id,运行总计太大和前一个运行总计太大的行。
2b)将第一个临时表中的行删除到ID
小于或等于ID
第二个临时表中的结果临时表中。使用其他列根据需要调整运行总计。
3)删除不再处理后,DELETE OUTPUT
在结果表中再运行一行。这是针对组末尾永不超过重置值的行。
我将逐步在T-SQL中完成上述算法的一种实现。
首先创建一些临时表。#initial_results
包含具有标准运行总计的原始数据,在#group_bookkeeping
每个循环中进行更新以找出可以移动的行,并#final_results
包含针对重置进行了调整的运行总计的结果。
CREATE TABLE #initial_results (
id int,
val int,
reset_val int,
grp int,
initial_running_total int
);
CREATE TABLE #group_bookkeeping (
grp int,
max_id_to_move int,
running_total_to_subtract_this_loop int,
running_total_to_subtract_next_loop int,
grp_done bit,
PRIMARY KEY (grp)
);
CREATE TABLE #final_results (
id int,
val int,
reset_val int,
grp int,
running_total int
);
INSERT INTO #initial_results WITH (TABLOCK)
SELECT ID, VAL, RESET_VAL, GRP, SUM(VAL) OVER (PARTITION BY GRP ORDER BY ID) RUNNING_TOTAL
FROM reset_runn_total;
CREATE CLUSTERED INDEX i1 ON #initial_results (grp, id);
INSERT INTO #group_bookkeeping WITH (TABLOCK)
SELECT DISTINCT GRP, 0, 0, 0, 0
FROM reset_runn_total;
之后,我在临时表上创建了聚集索引,因此插入和索引构建可以并行完成。对我的机器影响很大,但对您的机器可能没有影响。在源表上创建索引似乎没有帮助,但这可能对您的计算机有所帮助。
下面的代码在循环中运行,并更新簿记表。对于每个组,我们需要找到最大值ID
,该最大值应移到结果表中。我们需要该行的运行总计,以便可以从初始运行总计中减去它。grp_done
当不再需要做任何工作时,该列设置为1 grp
。
WITH UPD_CTE AS (
SELECT
#grp_bookkeeping.GRP
, MIN(CASE WHEN initial_running_total - #group_bookkeeping.running_total_to_subtract_next_loop > RESET_VAL THEN ID ELSE NULL END) max_id_to_update
, MIN(#group_bookkeeping.running_total_to_subtract_next_loop) running_total_to_subtract_this_loop
, MIN(CASE WHEN initial_running_total - #group_bookkeeping.running_total_to_subtract_next_loop > RESET_VAL THEN initial_running_total ELSE NULL END) additional_value_next_loop
, CASE WHEN MIN(CASE WHEN initial_running_total - #group_bookkeeping.running_total_to_subtract_next_loop > RESET_VAL THEN ID ELSE NULL END) IS NULL THEN 1 ELSE 0 END grp_done
FROM #group_bookkeeping
INNER JOIN #initial_results IR ON #group_bookkeeping.grp = ir.grp
WHERE #group_bookkeeping.grp_done = 0
GROUP BY #group_bookkeeping.GRP
)
UPDATE #group_bookkeeping
SET #group_bookkeeping.max_id_to_move = uv.max_id_to_update
, #group_bookkeeping.running_total_to_subtract_this_loop = uv.running_total_to_subtract_this_loop
, #group_bookkeeping.running_total_to_subtract_next_loop = uv.additional_value_next_loop
, #group_bookkeeping.grp_done = uv.grp_done
FROM UPD_CTE uv
WHERE uv.GRP = #group_bookkeeping.grp
OPTION (LOOP JOIN);
LOOP JOIN
通常,确实不是该提示的支持者,但这是一个简单的查询,并且是获得我想要的最快的方法。为了真正优化响应时间,我希望使用并行嵌套循环联接,而不是DOP 1合并联接。
下面的代码在循环中运行,并将数据从初始表移至最终结果表。注意对初始运行总计的调整。
DELETE ir
OUTPUT DELETED.id,
DELETED.VAL,
DELETED.RESET_VAL,
DELETED.GRP ,
DELETED.initial_running_total - tb.running_total_to_subtract_this_loop
INTO #final_results
FROM #initial_results ir
INNER JOIN #group_bookkeeping tb ON ir.GRP = tb.GRP AND ir.ID <= tb.max_id_to_move
WHERE tb.grp_done = 0;
为了您的方便,下面是完整的代码:
DECLARE @RC INT;
BEGIN
SET NOCOUNT ON;
CREATE TABLE #initial_results (
id int,
val int,
reset_val int,
grp int,
initial_running_total int
);
CREATE TABLE #group_bookkeeping (
grp int,
max_id_to_move int,
running_total_to_subtract_this_loop int,
running_total_to_subtract_next_loop int,
grp_done bit,
PRIMARY KEY (grp)
);
CREATE TABLE #final_results (
id int,
val int,
reset_val int,
grp int,
running_total int
);
INSERT INTO #initial_results WITH (TABLOCK)
SELECT ID, VAL, RESET_VAL, GRP, SUM(VAL) OVER (PARTITION BY GRP ORDER BY ID) RUNNING_TOTAL
FROM reset_runn_total;
CREATE CLUSTERED INDEX i1 ON #initial_results (grp, id);
INSERT INTO #group_bookkeeping WITH (TABLOCK)
SELECT DISTINCT GRP, 0, 0, 0, 0
FROM reset_runn_total;
SET @RC = 1;
WHILE @RC > 0
BEGIN
WITH UPD_CTE AS (
SELECT
#group_bookkeeping.GRP
, MIN(CASE WHEN initial_running_total - #group_bookkeeping.running_total_to_subtract_next_loop > RESET_VAL THEN ID ELSE NULL END) max_id_to_move
, MIN(#group_bookkeeping.running_total_to_subtract_next_loop) running_total_to_subtract_this_loop
, MIN(CASE WHEN initial_running_total - #group_bookkeeping.running_total_to_subtract_next_loop > RESET_VAL THEN initial_running_total ELSE NULL END) additional_value_next_loop
, CASE WHEN MIN(CASE WHEN initial_running_total - #group_bookkeeping.running_total_to_subtract_next_loop > RESET_VAL THEN ID ELSE NULL END) IS NULL THEN 1 ELSE 0 END grp_done
FROM #group_bookkeeping
CROSS APPLY (SELECT ID, RESET_VAL, initial_running_total FROM #initial_results ir WHERE #group_bookkeeping.grp = ir.grp ) ir
WHERE #group_bookkeeping.grp_done = 0
GROUP BY #group_bookkeeping.GRP
)
UPDATE #group_bookkeeping
SET #group_bookkeeping.max_id_to_move = uv.max_id_to_move
, #group_bookkeeping.running_total_to_subtract_this_loop = uv.running_total_to_subtract_this_loop
, #group_bookkeeping.running_total_to_subtract_next_loop = uv.additional_value_next_loop
, #group_bookkeeping.grp_done = uv.grp_done
FROM UPD_CTE uv
WHERE uv.GRP = #group_bookkeeping.grp
OPTION (LOOP JOIN);
DELETE ir
OUTPUT DELETED.id,
DELETED.VAL,
DELETED.RESET_VAL,
DELETED.GRP ,
DELETED.initial_running_total - tb.running_total_to_subtract_this_loop
INTO #final_results
FROM #initial_results ir
INNER JOIN #group_bookkeeping tb ON ir.GRP = tb.GRP AND ir.ID <= tb.max_id_to_move
WHERE tb.grp_done = 0;
SET @RC = @@ROWCOUNT;
END;
DELETE ir
OUTPUT DELETED.id,
DELETED.VAL,
DELETED.RESET_VAL,
DELETED.GRP ,
DELETED.initial_running_total - tb.running_total_to_subtract_this_loop
INTO #final_results
FROM #initial_results ir
INNER JOIN #group_bookkeeping tb ON ir.GRP = tb.GRP;
CREATE CLUSTERED INDEX f1 ON #final_results (grp, id);
/* -- do something with the data
SELECT *
FROM #final_results
ORDER BY grp, id;
*/
DROP TABLE #final_results;
DROP TABLE #initial_results;
DROP TABLE #group_bookkeeping;
END;