Sunday, December 19, 2010

Finding the most common element in an array using SAS

Found a need for this a while back - nothing fancy here - if there is a tie, only the first of the most commonly occurring elements is listed.

/* Create a data set of 1000 records. An array of 10 elements is also created - we want to find the most commonly occurring element */
data temp;
    /* a1-a10 hold the ten values of each record; the mode is searched across them later. */
    array a(*) a1 - a10;

    /* Seed the RAND stream once, before the first RAND call.
       CALL STREAMINIT takes effect only once per DATA step, so reseeding inside
       the loops with 215582 + id * j never did anything after the first call
       (id = 1, j = 1). Using that first seed directly keeps the same stream. */
    call streaminit(215583);

    do id = 1 to 1000;
        do j = 1 to dim(a);
            /* Normal(mean 5, sd 4) rounded to an integer so that repeated values occur. */
            a(j) = round(rand('normal', 5, 4));
        end;
        output;   /* one record per id */
    end;

    drop j;
run;

data temp;
    set temp;
    array a(*) a1-a10;

    /* Sort the row's values in place so that equal values sit next to each other. */
    call sortn(of a(*));

    /* count      = equal adjacent pairs seen in the current run of identical values
       mode_count = the same tally for the best run found so far */
    count = 0;
    mode_count = 0;

    do i = 1 to dim(a) - 1;
        /* Pairs that touch a missing value are skipped entirely. */
        if a(i) ne . and a(i+1) ne . then do;
            if a(i) = a(i+1) then do;
                /* Still inside a run: remember its value and extend the tally. */
                find_first = a(i);
                count = count + 1;
            end;
            else do;
                /* The run just ended. A strictly longer run replaces the current
                   mode, so on a tie the earlier value is kept. */
                if count > mode_count then do;
                    mode = find_first;
                    mode_count = count;
                end;
                count = 0;
            end;
        end;
    end;

    /* A run that reaches the last element is never closed inside the loop.
       Take it when no mode has been found yet and it has more than one equal
       pair, or when it beats the best run so far. */
    if (mode = . and count > 1) or count > mode_count then do;
        mode = a(dim(a));
        mode_count = count;
    end;

    drop i;
run;

/* Now let's check if it finds the correct ones */
/* Turn each record into a column: id1-id1000, with the ten array values as rows. */
proc transpose data=temp out=ttemp prefix=id;
    var a1-a10;
    id id;
run;

/* Let PROC MEANS compute the mode of every id column as the reference answer. */
proc means data=ttemp noprint mode;
    var id1-id1000;
    output out=checkmode(drop=_type_ _freq_) mode=cmode1-cmode1000;
run;

/* Back to one row per id; the reference mode lands in checkmode1. */
proc transpose data=checkmode out=c prefix=checkmode;
    var cmode1-cmode1000;
run;

/* Number the reference rows by position so they can be matched to temp by id. */
data c;
    set c;
    id = _N_;
    drop _name_;
run;

/* Put the hand-rolled mode next to the PROC MEANS mode and flag disagreements. */
data ctemp;
    merge temp c;
    by id;
    if mode ^= checkmode1 then flag = 1;
run;

/* Keep only the flagged records; an empty data set means both methods agree. */
data wrong;
    set ctemp(where=(flag = 1));
    drop find_first flag;
run;

/* In macro form */
/*
 * %array_mode - find the most common non-missing value of a numeric array.
 * Invoke inside a DATA step after the array has been declared.
 *
 *   arrayname = name of the numeric array to scan (required). The array is
 *               sorted in place by CALL SORTN.
 *   modevar   = variable that receives the mode (default: mode). Set to
 *               missing when no value occurs more than once.
 *   countvar  = variable that receives the number of equal adjacent pairs in
 *               the winning run, i.e. occurrences minus one (default: mode_count).
 *
 * On a tie the value that comes first in sorted order is kept.
 * Work variables count and find_first are left in the data step; i is dropped.
 * Do not use count, find_first or i as modevar/countvar.
 */
%macro array_mode(arrayname=, modevar=mode, countvar=mode_count);
    /* Sort so that equal values sit next to each other. */
    call sortn(of &arrayname(*));
    count = 0; &countvar = 0;
    do i = 1 to dim(&arrayname) - 1;
        /* Pairs that touch a missing value are skipped entirely. */
        if &arrayname(i) ne . and &arrayname(i+1) ne . then do;
            if &arrayname(i) = &arrayname(i+1) then do;
                /* Still inside a run of identical values. */
                find_first = &arrayname(i);
                count = count + 1;
            end;
            else do;
                /* Run ended: only a strictly longer run replaces the current mode. */
                if count > &countvar then do;
                    &modevar = find_first;
                    &countvar = count;
                end;
                count = 0;
            end;
        end;
    end;
    /* A run that reaches the last element is never closed inside the loop. */
    if count > &countvar then do;
        &modevar = &arrayname(dim(&arrayname));
        &countvar = count;
    end;
    /* No repeated value means no mode. Clear the variable so that a value left
       over from an earlier invocation, or read from the input data set, is not
       reported as this array's mode. */
    if &countvar = 0 then &modevar = .;
    drop i;
%mend array_mode;

No comments: