Creating groups of consecutive days meeting a give

2020-03-01 19:32发布

问题:

I have table the following data structure in SQL Server:

ID  Date        Allocation
 1, 2012-01-01, 0
 2, 2012-01-02, 2
 3, 2012-01-03, 0
 4, 2012-01-04, 0
 5, 2012-01-05, 0
 6, 2012-01-06, 5

etc.

What I need to do is get all consecutive day periods where Allocation = 0, and in the following form:

Start Date    End Date     DayCount
2012-01-01    2012-01-01   1
2012-01-03    2012-01-05   3

etc.

Is it possible to do this in SQL, and if so how?

回答1:

In this answer, I'll assume that the "id" field numbers the rows consecutively when sorted by increasing date, like it does in the example data. (Such a column can be created if it does not exist).

This is an example of a technique described here and here.

1) Join the table to itself on adjacent "id" values. This pairs adjacent rows. Select rows where the "allocation" field has changed. Store the result in a temporary table, also keeping a running index.

SET @idx = 0;
CREATE TEMPORARY TABLE boundaries
SELECT
   (@idx := @idx + 1) AS idx,
   a1.date AS prev_end,
   a2.date AS next_start,
   a1.allocation as allocation
FROM allocations a1
JOIN allocations a2
ON (a2.id = a1.id + 1)
WHERE a1.allocation != a2.allocation;

This gives you a table having "the end of the previous period", "the start of the next period", and "the value of 'allocation' in the previous period" in each row:

+------+------------+------------+------------+
| idx  | prev_end   | next_start | allocation |
+------+------------+------------+------------+
|    1 | 2012-01-01 | 2012-01-02 |          0 |
|    2 | 2012-01-02 | 2012-01-03 |          2 |
|    3 | 2012-01-05 | 2012-01-06 |          0 |
+------+------------+------------+------------+

2) We need the start and end of each period in the same row, so we need to combine adjacent rows again. Do this by creating a second temporary table like boundaries but having an idx field 1 greater:

+------+------------+------------+
| idx  | prev_end   | next_start |
+------+------------+------------+
|    2 | 2012-01-01 | 2012-01-02 |
|    3 | 2012-01-02 | 2012-01-03 |
|    4 | 2012-01-05 | 2012-01-06 |
+------+------------+------------+

Now join on the idx field and we get the answer:

SELECT
  boundaries2.next_start AS start,
  boundaries.prev_end AS end,
  allocation
FROM boundaries
JOIN boundaries2
USING(idx);

+------------+------------+------------+
| start      | end        | allocation |
+------------+------------+------------+
| 2012-01-02 | 2012-01-02 |          2 |
| 2012-01-03 | 2012-01-05 |          0 |
+------------+------------+------------+

** Note that this answer gets the "internal" periods correctly but misses the two "edge" periods where allocation = 0 at the beginning and allocation = 5 at the end. Those can be pulled in using UNION clauses but I wanted to present the core idea without that complication.



回答2:

Following would be one way to do it. The gist of this solution is

  • Use a CTE to get a list of all consecutive start and enddates with Allocation = 0
  • Use the ROW_NUMBER window function to assign rownumbers depending on both start- and enddates.
  • Select only those records where both ROW_NUMBERS equal 1.
  • Use DATEDIFFto calculate the DayCount

SQL Statement

;WITH r AS (
  SELECT  StartDate = Date, EndDate = Date
  FROM    YourTable
  WHERE   Allocation = 0
  UNION ALL
  SELECT  r.StartDate, q.Date
  FROM    r
          INNER JOIN YourTable q ON DATEDIFF(dd, r.EndDate, q.Date) = 1
  WHERE   q.Allocation = 0          
)
SELECT  [Start Date] = s.StartDate
        , [End Date ] = s.EndDate
        , [DayCount] = DATEDIFF(dd, s.StartDate, s.EndDate) + 1
FROM    (
          SELECT  *
                  , rn1 = ROW_NUMBER() OVER (PARTITION BY StartDate ORDER BY EndDate DESC)
                  , rn2 = ROW_NUMBER() OVER (PARTITION BY EndDate ORDER BY StartDate ASC)
          FROM    r          
        ) s
WHERE   s.rn1 = 1
        AND s.rn2 = 1
OPTION  (MAXRECURSION 0)

Test script

;WITH q (ID, Date, Allocation) AS (
  SELECT * FROM (VALUES
    (1, '2012-01-01', 0)
    , (2, '2012-01-02', 2)
    , (3, '2012-01-03', 0)
    , (4, '2012-01-04', 0)
    , (5, '2012-01-05', 0)
    , (6, '2012-01-06', 5)
  ) a (a, b, c)
)
, r AS (
  SELECT  StartDate = Date, EndDate = Date
  FROM    q
  WHERE   Allocation = 0
  UNION ALL
  SELECT  r.StartDate, q.Date
  FROM    r
          INNER JOIN q ON DATEDIFF(dd, r.EndDate, q.Date) = 1
  WHERE   q.Allocation = 0          
)
SELECT  s.StartDate, s.EndDate, DATEDIFF(dd, s.StartDate, s.EndDate) + 1
FROM    (
          SELECT  *
                  , rn1 = ROW_NUMBER() OVER (PARTITION BY StartDate ORDER BY EndDate DESC)
                  , rn2 = ROW_NUMBER() OVER (PARTITION BY EndDate ORDER BY StartDate ASC)
          FROM    r          
        ) s
WHERE   s.rn1 = 1
        AND s.rn2 = 1
OPTION  (MAXRECURSION 0)


回答3:

Alternative way with CTE but without ROW_NUMBER(),

Sample data:

if object_id('tempdb..#tab') is not null
    drop table #tab

create table #tab (id int, date datetime, allocation int)

insert into #tab
select 1, '2012-01-01', 0 union
select 2, '2012-01-02', 2 union
select 3, '2012-01-03', 0 union
select 4, '2012-01-04', 0 union
select 5, '2012-01-05', 0 union
select 6, '2012-01-06', 5 union
select 7, '2012-01-07', 0 union
select 8, '2012-01-08', 5 union
select 9, '2012-01-09', 0 union
select 10, '2012-01-10', 0

Query:

;with cte(s_id, e_id, b_id) as (
    select s.id, e.id, b.id
    from #tab s
    left join #tab e on dateadd(dd, 1, s.date) = e.date and e.allocation = 0
    left join #tab b on dateadd(dd, -1, s.date) = b.date and b.allocation = 0
    where s.allocation = 0
)
select ts.date as [start date], te.date as [end date], count(*) as [day count] from (
    select c1.s_id as s, (
        select min(s_id) from cte c2 
        where c2.e_id is null and c2.s_id >= c1.s_id
    ) as e
    from cte c1
    where b_id is null
) t
join #tab t1 on t1.id between t.s and t.e and t1.allocation = 0
join #tab ts on ts.id = t.s
join #tab te on te.id = t.e
group by t.s, t.e, ts.date, te.date

Live example at data.SE.



回答4:

Using this sample data:

CREATE TABLE MyTable (ID INT, Date DATETIME, Allocation INT);
INSERT INTO MyTable VALUES (1, {d '2012-01-01'}, 0);
INSERT INTO MyTable VALUES (2, {d '2012-01-02'}, 2);
INSERT INTO MyTable VALUES (3, {d '2012-01-03'}, 0);
INSERT INTO MyTable VALUES (4, {d '2012-01-04'}, 0);
INSERT INTO MyTable VALUES (5, {d '2012-01-05'}, 0);
INSERT INTO MyTable VALUES (6, {d '2012-01-06'}, 5);
GO

Try this:

WITH DateGroups (ID, Date, Allocation, SeedID) AS (
    SELECT MyTable.ID, MyTable.Date, MyTable.Allocation, MyTable.ID
      FROM MyTable
      LEFT JOIN MyTable Prev ON Prev.Date = DATEADD(d, -1, MyTable.Date) 
                            AND Prev.Allocation = 0
     WHERE Prev.ID IS NULL
       AND MyTable.Allocation = 0
    UNION ALL
    SELECT MyTable.ID, MyTable.Date, MyTable.Allocation, DateGroups.SeedID
      FROM MyTable
      JOIN DateGroups ON MyTable.Date = DATEADD(d, 1, DateGroups.Date)
     WHERE MyTable.Allocation = 0

), StartDates (ID, StartDate, DayCount) AS (
    SELECT SeedID, MIN(Date), COUNT(ID)
      FROM DateGroups
     GROUP BY SeedID

), EndDates (ID, EndDate) AS (
    SELECT SeedID, MAX(Date)
      FROM DateGroups
     GROUP BY SeedID

)
SELECT StartDates.StartDate, EndDates.EndDate, StartDates.DayCount
  FROM StartDates
  JOIN EndDates ON StartDates.ID = EndDates.ID;

The first section of the query is a recursive SELECT, which is anchored by all rows that are allocation = 0, and whose previous day either doesn't exist or has allocation != 0. This effectively returns IDs: 1 and 3 which are the starting dates of the periods of time you want to return.

The recursive part of this same query starts from the anchor rows, and finds all subsequent dates that also have allocation = 0. The SeedID keeps track of the anchored ID through all the iterations.

The result so far is this:

ID          Date                    Allocation  SeedID
----------- ----------------------- ----------- -----------
1           2012-01-01 00:00:00.000 0           1
3           2012-01-03 00:00:00.000 0           3
4           2012-01-04 00:00:00.000 0           3
5           2012-01-05 00:00:00.000 0           3

The next sub query uses a simple GROUP BY to filter out all the start dates for each SeedID, and also counts the days.

The last sub query does the same thing with the end dates, but this time the day count isn't needed as we already have this.

The final SELECT query joins these two together to combine the start and end dates, and returns them along with the day count.



回答5:

Give it a try if it works for you Here SDATE for your DATE remains same as your table.

SELECT SDATE,
CASE WHEN (SELECT COUNT(*)-1 FROM TABLE1 WHERE ID BETWEEN TBL1.ID AND (SELECT MIN(ID) FROM TABLE1 WHERE ID > TBL1.ID AND ALLOCATION!=0)) >0 THEN(
CASE WHEN (SELECT SDATE FROM TABLE1 WHERE ID =(SELECT MAX(ID) FROM TABLE1 WHERE ID >TBL1.ID AND ID<(SELECT MIN(ID) FROM TABLE1 WHERE ID > TBL1.ID AND ALLOCATION!=0))) IS NULL THEN SDATE
ELSE (SELECT SDATE FROM TABLE1 WHERE ID =(SELECT MAX(ID) FROM TABLE1 WHERE ID >TBL1.ID AND ID<(SELECT MIN(ID) FROM TABLE1 WHERE ID > TBL1.ID AND ALLOCATION!=0))) END
)ELSE (SELECT SDATE FROM TABLE1 WHERE ID = (SELECT MAX(ID) FROM TABLE1 WHERE ID > TBL1.ID ))END AS EDATE
,CASE WHEN (SELECT COUNT(*)-1 FROM TABLE1 WHERE ID BETWEEN TBL1.ID AND (SELECT MIN(ID) FROM TABLE1 WHERE ID > TBL1.ID AND ALLOCATION!=0)) <0 THEN 
(SELECT COUNT(*) FROM TABLE1 WHERE ID BETWEEN TBL1.ID AND (SELECT MAX(ID) FROM TABLE1 WHERE ID > TBL1.ID )) ELSE
(SELECT COUNT(*)-1 FROM TABLE1 WHERE ID BETWEEN TBL1.ID AND (SELECT MIN(ID) FROM TABLE1 WHERE ID > TBL1.ID AND ALLOCATION!=0)) END AS DAYCOUNT
FROM TABLE1 TBL1 WHERE ALLOCATION = 0
AND (((SELECT ALLOCATION FROM TABLE1 WHERE ID=(SELECT MAX(ID) FROM TABLE1  WHERE ID < TBL1.ID))<> 0 ) OR (SELECT MAX(ID) FROM TABLE1  WHERE ID < TBL1.ID)IS NULL); 


回答6:

A solution without CTE:

SELECT a.aDate AS StartDate
    , MIN(c.aDate) AS EndDate
    , (datediff(day, a.aDate, MIN(c.aDate)) + 1) AS DayCount
FROM (
    SELECT x.aDate, x.allocation, COUNT(*) idn FROM table1 x
    JOIN table1 y ON y.aDate <= x.aDate
    GROUP BY x.id, x.aDate, x.allocation
) AS a
LEFT JOIN (
    SELECT x.aDate, x.allocation, COUNT(*) idn FROM table1 x
    JOIN table1 y ON y.aDate <= x.aDate
    GROUP BY x.id, x.aDate, x.allocation
) AS b ON a.idn = b.idn + 1 AND b.allocation = a.allocation
LEFT JOIN (
    SELECT x.aDate, x.allocation, COUNT(*) idn FROM table1 x
    JOIN table1 y ON y.aDate <= x.aDate
    GROUP BY x.id, x.aDate, x.allocation
) AS c ON a.idn <= c.idn AND c.allocation = a.allocation
LEFT JOIN (
    SELECT x.aDate, x.allocation, COUNT(*) idn FROM table1 x
    JOIN table1 y ON y.aDate <= x.aDate
    GROUP BY x.id, x.aDate, x.allocation
) AS d ON c.idn = d.idn - 1 AND d.allocation = c.allocation
WHERE b.idn IS NULL AND c.idn IS NOT NULL AND d.idn IS NULL AND a.allocation = 0
GROUP BY a.aDate

Example