I want to compute a cumulative max grouped by another column.
Say I have this data:
data have;
input grp $ number;
datalines;
a 3
b 4
a 5
b 2
a 1
b 8
;
My desired output would be:
data want;
input grp $ cummax;
a 3
b 4
a 5
b 4
a 5
b 8
;
My real case will involve several grouping columns + filters, and ideally this cumulative max would be computed on several columns at the same time.
My main concern is computational efficiency as I'll be running this on tables of ten to hundred millions of rows. Proc SQL or native SAS are both welcome.
Rows might be shuffled if necessary.
System Info
proc product_status;run;
For Base SAS Software ...
Custom version information: 9.3_M2
Image version information: 9.03.01M2P080112
For SAS/STAT ...
Custom version information: 12.1
Image version information: 9.03.01M0P081512
For SAS/GRAPH ...
Custom version information: 9.3_M2
For SAS/CONNECT ...
Custom version information: 9.3_M2
For SAS OLAP Server ...
Custom version information: 9.3_M1
For SAS Enterprise Miner ...
Custom version information: 12.1
Image version information: 9.03.01M0P081512
For SAS Integration Technologies ...
Custom version information: 9.3_M2
For SAS/ACCESS Interface to Oracle ...
Custom version information: 9.3_M1
For SAS/ACCESS Interface to PC Files ...
Custom version information: 9.3_M2
proc setinit;run;
Product expiration dates:
---Base SAS Software 31JUL2018
---SAS/STAT 31JUL2018
---SAS/GRAPH 31JUL2018
---SAS/CONNECT 31JUL2018
---SAS OLAP Server 31JUL2018
---SAS Enterprise Miner 31JUL2018
---MDDB Server common products 31JUL2018
---SAS Integration Technologies 31JUL2018
---SAS Enterprise Miner Server 31JUL2018
---SAS Enterprise Miner Client 31JUL2018
---Unused OLAP Slot 31JUL2018
---SAS Enterprise Guide 31JUL2018
---SAS/ACCESS Interface to Oracle 31JUL2018
---SAS/ACCESS Interface to PC Files 31JUL2018
---SAS Metadata Bridges for Informatica 31JUL2018
---SAS Metadata Bridges for Microsoft SQL Server 31JUL2018
---SAS Metadata Bridge for Oracle 31JUL2018
---SAS Workspace Server for Local Access 31JUL2018
---SAS Workspace Server for Enterprise Access 31JUL2018
---SAS Table Server 31JUL2018
---DataFlux Trans DB Driver 31JUL2018
---SAS Framework Data Server 31JUL2018
---SAS Add-in for Microsoft Excel 31JUL2018
---SAS Add-in for Microsoft Outlook 31JUL2018
---SAS Add-in for Microsoft PowerPoint 31JUL2018
---SAS Add-in for Microsoft Word 31JUL2018
Use a HASH
object to store the max for each variable and group combination. This will allow you to single pass through your data set and code something that you can scale for the number of groups and variables.
This does not require a sort which can be costly on a large data set.
Test Data
data example;
format grp1-grp5 $1.;
array grp[5];
array val[5];
do rows=1 to 1000000;
do i=1 to 5;
r = ceil(ranuni(1)*5);
grp[i] = substr("ABCDE",r,1);
end;
do j=1 to 5;
val[j] = 10*rannor(1);
end;
output;
end;
keep grp: val:;
run;
Data Step to compute the cumulative max
data want;
set example;
array val[5];
array max[5];
if _n_ = 1 then do;
declare hash mx();
rc = mx.defineKey('grp1','grp2','grp3','grp4','grp5');
rc = mx.definedata('max1','max2','max3','max4','max5');
rc = mx.definedone();
end;
rc = mx.find();
/*No Max for this combination -- add it*/
if rc then do;
do i=1 to 5;
max[i] = val[i];
end;
end;
/*Update Max Values*/
do i=1 to 5;
if val[i] > max[i] then
max[i] = val[i];
end;
/*Update Hash*/
rc = mx.replace();
drop rc i;
n = _n_; /*This is for testing*/
run;
Using that testing variable n
, we can sort the groups keeping the original order and see if it worked. (hint, it did).
proc sort data=want;
by grp: n;
run;
proc sort data=have;
by grp;
run;
data want;
set have;
by grp;
retain max;
max=ifn(first.grp,number,max(number,max));
run;
Use Hash without sort
data want;
if _n_=1 then do;
declare hash h();
h.definekey('grp');
h.definedata('value');
h.definedone();
end;
set have;
if h.find()^=0 then do;
h.add(key:grp,data:number);
max=number;
end;
else do;
max=max(number,value);
h.replace(key:grp,data:number);
end;
drop value number;
run;
something like the following will work. If you want to keep the original order add a row counter and resort on that:
proc sort data=have;
by grp;
run;
data new;
drop newnum;
set have;
by grp;
retain newnum;
if first.grp then newnum = number;
if number > newnum then newnum=number;
else number=newnum;
run;
I built a macro function wrapped around @DomPazz 's solution, one can choose which columns to group by, which columns to compute on and which columns to drop or keep in the end.
I think included examples are straightforward.
I joined at the bottom the short convenience macro functions that I use in cummax
.
*------------------------------------------------------------;
* CUMMAX ;
* Compute a cumulative max on 1 or several variables grouped ;
* by one or several variables; ;
*------------------------------------------------------------;
/* EXAMPLE:
data have;
format grp1-grp2 $1.;
array grp[2];
array val[3];
do rows=1 to 20;
do i=1 to 2;
r = ceil(ranuni(1)*2);
grp[i] = substr("AB",r,1);
end;
do j=1 to 3;
val[j] = 10*rannor(1);
end;
output;
end;
keep grp: val:;
run;
%cummax(have,grp=grp1 grp2,val=val1 val2,out= want1)
%cummax(have,grp=grp1,val=val1,drop=grp2 val3,out= want2)
%cummax(have,grp=grp1,val=val1,keep= val2,out= want3)
*/
%macro cummax
(data /* source table */
,grp= /* variables to group on */
,val= /* variables to compute on */
,keep= /* variables to keep additionally to grp and computed columns, don't use with drop */
,drop= /* variables to drop, don't use with keep */
,out= /* output table */
);
/* default output */
%if not %length(&out) %then %let out = &data;
/* rework keep and drop */
%local n_val max_val;
%let n_val = %list_length(&val);
%let max_val = %list_fix(&val,suffix=_cmax);
%if %length(&keep) %then %let keep = (keep= &keep &grp &max_val );
%if %length(&drop) %then %let drop = (drop= &drop);
/* data step */
data &out&keep&drop;
set &data;
array val[&n_val] &val;
array max[&n_val] &max_val;
if _n_ = 1 then do;
declare hash mx();
rc = mx.defineKey(%list_quote_comma(&grp));
rc = mx.definedata(%list_quote_comma(&max_val));
rc = mx.definedone();
end;
rc = mx.find();
/*No Max for this combination -- add it*/
if rc then do;
do i=1 to &n_val; /* %list_length(&val) */
max[i] = val[i];
end;
end;
/*Update Max Values*/
do i=1 to &n_val;
if val[i] > max[i] then
max[i] = val[i];
end;
/*Update Hash*/
rc = mx.replace();
drop rc i;
run;
%mend;
*---------------------------------------------------------------;
* LIST_LENGTH ;
* Length of space separated list ;
*---------------------------------------------------------------;
/* EXAMPLES :
%put %list_length(item1 item2 item3);
*/
%macro list_length
(data
);
%sysfunc(countw(&data,%str( )))
%mend;
*---------------------------------------------------------------;
* LIST_QUOTE_COMMA ;
* create comma separated list with quoted items, from ;
* unquoted space separated list. ;
*---------------------------------------------------------------;
/* EXAMPLE
%put %list_quote_comma(a b c);
*/
%macro list_quote_comma
(data /* space separated list to quote */
);
%unquote(%str(%')%qsysfunc(tranwrd(&data,%str( ),%str(%',%')))%str(%'))
%mend;
*---------------------------------------------------------------;
* LIST_FIX ;
* Add prefix and/or suffix to items of space separated list ;
*---------------------------------------------------------------;
/* EXAMPLES :
%put %list_fix(item1 item2 item3,pref_,_suf);
%put %list_fix(item1 item2 item3,pref_);
%put %list_fix(item1 item2 item3,suffix=_suf);
*/
%macro list_fix
(data
,prefix
,suffix
);
%local output;
%do i=1 %to %sysfunc(countw(&data,%str( ))) ;
%let output= &output &prefix.%scan(&data,&i,%str( ))&suffix;
%end;
&output
%mend;