问题/疑问
我想对数据集中的一组变量（收入、成本、利润和 vcost）做一个简单的检查：取出每个变量的最大值和第二大值，检查这两个值之和是否大于该变量总和的 90%，如果是，就标记该变量。我还需要检查最大值是否超过总和的 60%。
我之前从另一个问题（“用宏把 SAS 表的检验结果输出到一张表”）得到了一些帮助，但现在我想先解决一个更基本的问题。这看起来应该不难，但我始终没弄清楚最后该如何生成这张基本表。
我知道所有的变量名。
下面是我创建了一个样本数据集: https://www.dropbox.com/s/x575w5d551uu47p/dataset%20%281%29.csv?dl=0
所需的输出
我想把这个基本表：
转换成这样的另一个表：
可重现的示例：
/* Build a small demo table: three firms, each with four random measures
   (revenue, costs, profits, vcost) to assess. */
data have;
    do firm = 1 to 3;
        revenue = rand("uniform");
        costs   = rand("uniform");
        profits = rand("uniform");
        vcost   = rand("uniform");
        output;   /* one row per firm */
    end;
run;
根据你在上一个答案下的评论，看起来 top_2_total 是最大的 2 个总和值之和。要做到这一点，需要再多写几个步骤。我用 PROC TRANSPOSE 和一个数据步得到了上一个答案已经实现的结果，然后用 PROC SUMMARY 取出最大的 2 个总和值，并再次使用该数据集生成最终结果。如果这对你有帮助，请告诉我。
/* Step 1: demo input - three firms with four random measures each. */
data have;
    do firm = 1 to 3;
        revenue = rand("uniform");
        costs   = rand("uniform");
        profits = rand("uniform");
        vcost   = rand("uniform");
        output;
    end;
run;

/* Step 2: flip the table so that each measure becomes one row and each firm
   one column (top_1, top_2, top_3 in observation order). */
proc transpose data=have out=want prefix=top_;
    var revenue costs profits vcost;
run;

/* Step 3: order the values inside each row and add the row total.
   The array lists the columns in reverse, so the ascending CALL SORTN
   leaves the largest value in top_1 and the smallest in top_3. */
data want;
    set want;
    array ranked{3} top_3 top_2 top_1;
    call sortn(of ranked{*});
    total = sum(of ranked{*});
run;

/* Step 4: pick the two largest row totals; IDGROUP writes them to a
   one-row data set as total_1 and total_2. */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

/* Step 5: attach the combined figure to every row and derive the flags.
   NOTE(review): both flags are measured against top_2_total, i.e. the sum of
   the two largest row totals across measures, not against the row's own
   total - confirm this is the intended denominator. */
data want;
    /* Read the one-row summary once; SET variables keep their values on
       later iterations. */
    if _n_ = 1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);
    set want;
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);
    drop total_1 total_2;
run;

proc print data=want;
run;
编辑：我在 PROC TRANSPOSE 之前增加了一段逻辑，你可以在那里列出需要参与计算的变量，其余工作由代码完成，之后运行代码时无需再做任何手动修改。变量应以空格分隔的列表形式给出。
/* Load the CSV into HAVE; the header line is skipped and the columns read
   are a firm id followed by three numeric measures. */
data have;
    infile 'C:\dataset (1).csv' dlm=',' dsd missover firstobs=2;
    input firm v1 v2 v3;
run;

/* Variables to assess: edit this space-separated list to add/remove columns. */
%let variable_to_consider=v1 v2 v3;
%let variable_to_consider=%cmpres(&variable_to_consider);

/* Count the rows of HAVE: after transposing there is one top_N column per row. */
proc sql noprint;
    select count(*) into :obs_count from have;
quit;
/* Re-assigning the value strips the leading blanks left by INTO. */
%let obs_count=&obs_count;

/* One row per assessed variable, one top_N column per input row. */
proc transpose data=have out=want prefix=top_;
    var &variable_to_consider;
run;

/* Order the values inside each row and add the row total.
   The reversed column range makes the ascending CALL SORTN put the largest
   value in top_1 and the second largest in top_2. */
data want;
    set want;
    array ranked{*} top_&obs_count.-top_1;
    call sortn(of ranked{*});
    total = sum(of ranked{*});
    keep _name_ top_1 top_2 total;
run;

/* Two largest row totals, written to a one-row data set as total_1/total_2. */
proc summary data=want nway;
    output out=total_top_2_rec(drop=_:)
           idgroup(max(total) out[2](total)=);
run;

/* Attach the combined figure to every row and derive the flags.
   NOTE(review): both flags are measured against top_2_total, i.e. the sum of
   the two largest row totals across variables, not against the row's own
   total - confirm this is the intended denominator. */
data want;
    /* Read the one-row summary once; SET variables keep their values on
       later iterations. */
    if _n_ = 1 then set total_top_2_rec;
    top_2_total = sum(total_1, total_2);
    set want;
    Flag90 = (sum(top_1, top_2) > 0.9 * top_2_total);
    Flag60 = (top_1 > top_2_total * 0.6);
    drop total_1 total_2;
run;

proc print data=want;
run;
2014 年 4 月 5 日编辑：如之前讨论的那样，我已经更新了逻辑并修复了问题。下面是更新后的代码。
/* Sample input 1: three firms, each with four random measures. */
data have1;
    do firm = 1 to 3;
        array measure{4} revenue costs profits vcost;
        /* Fill the measures in declaration order: revenue, costs, profits, vcost. */
        do _k = 1 to dim(measure);
            measure{_k} = rand("uniform");
        end;
        output;
    end;
    drop _k;
run;

/* Sample input 2: CSV file (header line skipped) with a firm id and three
   numeric measures. */
data have2;
    infile 'dataset (1).csv' dlm=',' dsd missover firstobs=2;
    input firm v1 v2 v3;
run;
/*
 * %mymacro - flag variables whose largest values dominate their total.
 *
 * Parameters:
 *   input_dataset        - data set with one row per observation.
 *   output_dataset       - data set to create, one row per assessed variable.
 *   variable_to_consider - space-separated list of numeric variables to assess.
 *
 * Output columns:
 *   _name_       name of the assessed variable
 *   top_1        largest value
 *   top_2        second-largest value
 *   total        sum of all values of the variable
 *   top_2_total  top_1 + top_2
 *   Flag90       1 when top_1 + top_2 exceeds 90% of total, else 0
 *   Flag60       1 when top_1 exceeds 60% of total, else 0
 */
%macro mymacro(input_dataset= ,output_dataset=, variable_to_consider=);
    /* Keep the row counter local so that a caller's or a global OBS_COUNT
       is not overwritten by the INTO clause below. */
    %local obs_count;
    %let variable_to_consider=%cmpres(&variable_to_consider);

    /* PROC TRANSPOSE creates one top_N column per input row, so the row
       count is the upper bound of the array declared further down. */
    proc sql noprint;
        select count(*) into : obs_count from &input_dataset;
    quit;
    /* Re-assigning the value strips the leading blanks left by INTO. */
    %let obs_count=&obs_count;

    /* One row per assessed variable, one top_N column per input row. */
    proc transpose data=&input_dataset out=&output_dataset prefix=top_;
        var &variable_to_consider;
    run;

    data &output_dataset;
        set &output_dataset;
        /* The reversed column range makes the ascending CALL SORTN put the
           largest value in top_1 and the second largest in top_2. */
        array top(*) top_&obs_count.-top_1;
        call sortn(of top[*]);
        total=sum(of top[*]);
        top_2_total=sum(top_1, top_2);
        if sum(top_1,top_2) > 0.9 * total then Flag90=1; else Flag90=0;
        if top_1 > total * 0.6 then Flag60=1; else Flag60=0;
        keep _name_ top_1 top_2 total top_2_total Flag60 Flag90;
    run;
%mend mymacro;
/* Run the check on the random sample table ... */
%mymacro(
    input_dataset=have1,
    output_dataset=want1,
    variable_to_consider=revenue costs profits vcost
)
/* ... and on the table read from the CSV file. */
%mymacro(
    input_dataset=have2,
    output_dataset=want2,
    variable_to_consider=v1 v2 v3
)

/* Show both result tables. */
proc print data=want1;
run;
proc print data=want2;
run;
这里困难的部分是取出每个变量最大的 2 个值。这在大多数 SQL 实现中都很简单，但我认为 SAS 的 proc sql 并不支持 select top n 语法。
我能想到以下几种可行的做法：
按每个感兴趣的变量对数据集做降序排序，从前 2 条观测中取值，转置后再把结果拼接在一起——由于要多次排序，这种做法效率很低，而且并不比其他方法简单多少。
编写一个（相当复杂的）数据步，提取每个变量最大的 2 个值。
让 PROC UNIVARIATE 替你提取最大值，然后把输出数据集转置成所需的格式。
数据步方法
/*
 * Single pass over HAVE that tracks the largest and second-largest value of
 * each assessed variable and writes one summary row per variable at the end.
 *
 * Output (TOP2): varname      name of the assessed variable
 *                top_1        largest value
 *                top_2        second-largest value (equal to top_1 on a tie)
 *                top_2_total  top_1 + top_2
 */
data top2;
    array v{4} revenue costs profits vcost;
    /* Running maxima start out missing, so any real value - including zero
       and negative ones - replaces them. _TEMPORARY_ arrays keep their
       values from one row to the next. */
    array best1{4} _temporary_;
    array best2{4} _temporary_;
    set have end = eof;
    do i = 1 to dim(v);
        /* Missing inputs never count as a largest value. */
        if missing(v[i]) then continue;
        if v[i] > best1[i] then do;
            /* New maximum: the previous maximum becomes the runner-up. */
            best2[i] = best1[i];
            best1[i] = v[i];
        end;
        /* Also reached on a tie with the current maximum, so a repeated
           largest value is reported as the second largest as well. */
        else if v[i] > best2[i] then best2[i] = v[i];
    end;
    /* 32 characters is the longest name VNAME can return. */
    length varname $32;
    if eof then do i = 1 to dim(v);
        varname = vname(v[i]);
        top_1 = best1[i];
        top_2 = best2[i];
        /* SUM ignores a missing runner-up (fewer than two values seen). */
        top_2_total = sum(top_1, top_2);
        output;
    end;
    keep varname top_:;
run;
PROC UNIVARIATE 方法
/* Let PROC UNIVARIATE find the extreme values and capture only its
   ExtremeObs ODS table (variable name + highest values); all other ODS
   destinations are closed while it runs. */
ods _all_ close;
ods output extremeobs = extremeobs(keep = varname high);
proc univariate data = have(drop = firm);
run;
ods listing;

/*
 * Collapse the ExtremeObs rows to one row per variable:
 *   top_1        last non-missing HIGH value of the variable
 *   top_2        the non-missing HIGH value before it
 *   top_2_total  top_1 + top_2
 *
 * The rows of each variable are expected in ascending order of HIGH, so the
 * last row holds the largest value. Tracking the two most recent values
 * instead of fixed row numbers 4 and 5 also works when the table has fewer
 * than five rows per variable (e.g. an input with only three observations).
 */
data top2_b;
    set extremeobs;
    by varname notsorted;
    retain top_2 top_1;
    if first.varname then call missing(top_2, top_1);
    if not missing(high) then do;
        /* Shift: the previous value becomes the runner-up. */
        top_2 = top_1;
        top_1 = high;
    end;
    if last.varname then do;
        /* SUM ignores a missing runner-up (only one value available). */
        top_2_total = sum(top_1, top_2);
        output;
    end;
    drop high;
run;
得到这个结果之后，你可以把它与 proc means / proc summary 生成的现有简单汇总表合并，并计算你感兴趣的其他指标。
在最后一步中，如果分子大于或等于分母，FLAG1 和 FLAG2 的值为正整数；如果分子小于分母，则值为零。
/* Demo input: four rows of random measures. The loop counter is not kept;
   VarName is a constant character column. */
data have;
    do firm = 1 to 4;
        VarName = 'Variable';
        revenue = rand("uniform");
        costs   = rand("uniform");
        profits = rand("uniform");
        vcost   = rand("uniform");
        output;
    end;
    drop firm;
run;

/* One row per numeric measure; the measure name goes to column Variable and
   the four input rows become Var_1 ... Var_4. */
proc transpose data=have out=transout name=Variable prefix=Var_;
run;

/* Echo the SAS code generated by macros to the log. */
options mprint;
/*
 * %calcflag - build OUTTABLE from TRANSOUT with, per row:
 *   Sum_var      sum of the listed columns
 *   Top_1        largest of the listed columns
 *   Top_2        second largest of the listed columns
 *   Top_2_total  Top_1 + Top_2
 *   flag1        floor(Top_2_total / (0.9 * Sum_var))
 *   flag2        floor(Top_1       / (0.6 * Sum_var))
 *
 * Varlist: comma-separated list of columns; quote it (e.g. with %str) so
 * that the commas are passed through as part of one argument.
 */
%macro calcflag(Varlist);
    proc sql;
        create table outtable as
        select Variable,
               sum(&Varlist)                               as Sum_var,
               largest(1, &Varlist)                        as Top_1,
               largest(2, &Varlist)                        as Top_2,
               sum(calculated Top_1, calculated Top_2)     as Top_2_total,
               /* The ratios are floored, so a flag is 0 while the numerator
                  stays below the scaled sum. */
               floor(calculated Top_2_total / (calculated Sum_var * 0.9)) as flag1,
               floor(calculated Top_1 / (calculated Sum_var * 0.6))       as flag2
        from transout;
    quit;
%mend;

%calcflag(%str(Var_1,Var_2,Var_3,Var_4));