Tigergraph provides wonderful accumulators for min, max, and average, but I needed to calculate standard deviation and percentiles, and didn’t want to do it outside of my GSQL query. Here is my ugly, strange, but functional solution. Probably better ways to do it, but this is fast and it works. Thought I would share… comments welcome.
Note that when you pass in a value like 25, you get the 25th and 75th percentiles (top and bottom) just like you could pass 10 to get 90th and 10th percentile.
CREATE QUERY markStatistics(INT percentile = 25) {
TYPEDEF tuple<INT measure> hMeasure;
MapAccum<STRING, MaxAccum<INT>> @@measureMax;
MapAccum<STRING, MinAccum<INT>> @@measureMin;
MapAccum<STRING, AvgAccum> @@measureAvg;
MapAccum<STRING, AvgAccum> @@diffs;
MapAccum<STRING, INT> @@count;
HeapAccum<hMeasure> (percentile, measure DESC) @@percentileTop;
HeapAccum<hMeasure> (percentile, measure ASC) @@percentileBot;
H = {GeneralMeasure.*};
H1 = SELECT h FROM H:h
ACCUM @@measureMax += (h.generalMeasureName -> h.generalMeasure),
@@measureMin += (h.generalMeasureName -> h.generalMeasure),
@@measureAvg += (h.generalMeasureName -> h.generalMeasure),
@@count += (h.generalMeasureName -> 1)
POST-ACCUM
@@diffs += (h.generalMeasureName ->
pow(h.generalMeasure - @@measureAvg.get(h.generalMeasureName),2));
// Use the diffs to calculate std deviation
FOREACH (key,val) in @@diffs DO
@@percentileTop.resize(@@count.get(key) * percentile / 100);
@@percentileBot.resize(@@count.get(key) * percentile / 100);
H2 = SELECT h FROM H:h
WHERE h.generalMeasureName == key
ACCUM @@percentileTop += hMeasure(h.generalMeasure),
@@percentileBot += hMeasure(h.generalMeasure);
FOREACH i IN RANGE[2, @@percentileTop.size()] DO
@@percentileTop.pop();
@@percentileBot.pop();
END;
PRINT key, @@measureMin.get(key) as minval, @@measureMax.get(key) as maxval, sqrt(val) as stddev, @@measureAvg.get(key) as avgval, @@count.get(key) as countvals, @@percentileTop.top().measure as topPercentile, @@percentileBot.top().measure as bottomPercentile;
END;
}