Spark SQL----内置函数Aggregate Functions
SELECT any(col) FROM VALUES (true), (false), (false) AS tab(col);
+
|any(col)|
+
| true|
+
SELECT any(col) FROM VALUES (NULL), (true), (false) AS tab(col);
+
|any(col)|
+
| true|
+
SELECT any(col) FROM VALUES (false), (false), (NULL) AS tab(col);
+
|any(col)|
+
| false|
+
SELECT any_value(col) FROM VALUES (10), (5), (20) AS tab(col);
+
|any_value(col)|
+
| 10|
+
SELECT any_value(col) FROM VALUES (NULL), (5), (20) AS tab(col);
+
|any_value(col)|
+
| NULL|
+
SELECT any_value(col, true) FROM VALUES (NULL), (5), (20) AS tab(col);
+
|any_value(col)|
+
| 5|
+
SELECT approx_count_distinct(col1) FROM VALUES (1), (1), (2), (2), (3) tab(col1);
+
|approx_count_distinct(col1)|
+
| 3|
+
SELECT approx_percentile(col, array(0.5, 0.4, 0.1), 100) FROM VALUES (0), (1), (2), (10) AS tab(col);
+
|approx_percentile(col, array(0.5, 0.4, 0.1), 100)|
+
| [1, 1, 0]|
+
SELECT approx_percentile(col, 0.5, 100) FROM VALUES (0), (6), (7), (9), (10) AS tab(col);
+
|approx_percentile(col, 0.5, 100)|
+
| 7|
+
SELECT approx_percentile(col, 0.5, 100) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '1' MONTH), (INTERVAL '2' MONTH), (INTERVAL '10' MONTH) AS tab(col);
+
|approx_percentile(col, 0.5, 100)|
+
| INTERVAL '1' MONTH|
+
SELECT approx_percentile(col, array(0.5, 0.7), 100) FROM VALUES (INTERVAL '0' SECOND), (INTERVAL '1' SECOND), (INTERVAL '2' SECOND), (INTERVAL '10' SECOND) AS tab(col);
+
|approx_percentile(col, array(0.5, 0.7), 100)|
+
| [INTERVAL '01' SE...|
+
SELECT array_agg(col) FROM VALUES (1), (2), (1) AS tab(col);
+
|collect_list(col)|
+
| [1, 2, 1]|
+
SELECT avg(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|avg(col)|
+
| 2.0|
+
SELECT avg(col) FROM VALUES (1), (2), (NULL) AS tab(col);
+
|avg(col)|
+
| 1.5|
+
SELECT bit_and(col) FROM VALUES (3), (5) AS tab(col);
+
|bit_and(col)|
+
| 1|
+
SELECT bit_or(col) FROM VALUES (3), (5) AS tab(col);
+
|bit_or(col)|
+
| 7|
+
SELECT bit_xor(col) FROM VALUES (3), (5) AS tab(col);
+
|bit_xor(col)|
+
| 6|
+
SELECT substring(hex(bitmap_construct_agg(bitmap_bit_position(col))), 0, 6) FROM VALUES (1), (2), (3) AS tab(col);
+
|substring(hex(bitmap_construct_agg(bitmap_bit_position(col))), 0, 6)|
+
| 070000|
+
SELECT substring(hex(bitmap_construct_agg(bitmap_bit_position(col))), 0, 6) FROM VALUES (1), (1), (1) AS tab(col);
+
|substring(hex(bitmap_construct_agg(bitmap_bit_position(col))), 0, 6)|
+
| 010000|
+
SELECT substring(hex(bitmap_or_agg(col)), 0, 6) FROM VALUES (X '10'), (X '20'), (X '40') AS tab(col);
+
|substring(hex(bitmap_or_agg(col)), 0, 6)|
+
| 700000|
+
SELECT substring(hex(bitmap_or_agg(col)), 0, 6) FROM VALUES (X '10'), (X '10'), (X '10') AS tab(col);
+
|substring(hex(bitmap_or_agg(col)), 0, 6)|
+
| 100000|
+
SELECT bool_and(col) FROM VALUES (true), (true), (true) AS tab(col);
+
|bool_and(col)|
+
| true|
+
SELECT bool_and(col) FROM VALUES (NULL), (true), (true) AS tab(col);
+
|bool_and(col)|
+
| true|
+
SELECT bool_and(col) FROM VALUES (true), (false), (true) AS tab(col);
+
|bool_and(col)|
+
| false|
+
SELECT bool_or(col) FROM VALUES (true), (false), (false) AS tab(col);
+
|bool_or(col)|
+
| true|
+
SELECT bool_or(col) FROM VALUES (NULL), (true), (false) AS tab(col);
+
|bool_or(col)|
+
| true|
+
SELECT bool_or(col) FROM VALUES (false), (false), (NULL) AS tab(col);
+
|bool_or(col)|
+
| false|
+
SELECT collect_list(col) FROM VALUES (1), (2), (1) AS tab(col);
+
|collect_list(col)|
+
| [1, 2, 1]|
+
SELECT collect_set(col) FROM VALUES (1), (2), (1) AS tab(col);
+
|collect_set(col)|
+
| [1, 2]|
+
SELECT corr(c1, c2) FROM VALUES (3, 2), (3, 3), (6, 4) as tab(c1, c2);
+
| corr(c1, c2)|
+
|0.8660254037844387|
+
SELECT count(*) FROM VALUES (NULL), (5), (5), (20) AS tab(col);
+
|count(1)|
+
| 4|
+
SELECT count(col) FROM VALUES (NULL), (5), (5), (20) AS tab(col);
+
|count(col)|
+
| 3|
+
SELECT count(DISTINCT col) FROM VALUES (NULL), (5), (5), (10) AS tab(col);
+
|count(DISTINCT col)|
+
| 2|
+
SELECT count_if(col % 2 = 0) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col);
+
|count_if(((col % 2) = 0))|
+
| 2|
+
SELECT count_if(col IS NULL) FROM VALUES (NULL), (0), (1), (2), (3) AS tab(col);
+
|count_if((col IS NULL))|
+
| 1|
+
SELECT hex(count_min_sketch(col, 0.5d, 0.5d, 1)) FROM VALUES (1), (2), (1) AS tab(col);
+
|hex(count_min_sketch(col, 0.5, 0.5, 1))|
+
| 00000001000000000...|
+
SELECT covar_pop(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2);
+
| covar_pop(c1, c2)|
+
|0.6666666666666666|
+
SELECT covar_samp(c1, c2) FROM VALUES (1,1), (2,2), (3,3) AS tab(c1, c2);
+
|covar_samp(c1, c2)|
+
| 1.0|
+
SELECT every(col) FROM VALUES (true), (true), (true) AS tab(col);
+
|every(col)|
+
| true|
+
SELECT every(col) FROM VALUES (NULL), (true), (true) AS tab(col);
+
|every(col)|
+
| true|
+
SELECT every(col) FROM VALUES (true), (false), (true) AS tab(col);
+
|every(col)|
+
| false|
+
SELECT first(col) FROM VALUES (10), (5), (20) AS tab(col);
+
|first(col)|
+
| 10|
+
SELECT first(col) FROM VALUES (NULL), (5), (20) AS tab(col);
+
|first(col)|
+
| NULL|
+
SELECT first(col, true) FROM VALUES (NULL), (5), (20) AS tab(col);
+
|first(col)|
+
| 5|
+
SELECT first_value(col) FROM VALUES (10), (5), (20) AS tab(col);
+
|first_value(col)|
+
| 10|
+
SELECT first_value(col) FROM VALUES (NULL), (5), (20) AS tab(col);
+
|first_value(col)|
+
| NULL|
+
SELECT first_value(col, true) FROM VALUES (NULL), (5), (20) AS tab(col);
+
|first_value(col)|
+
| 5|
+
SELECT name, grouping(name), sum(age) FROM VALUES (2, 'Alice'), (5, 'Bob') people(age, name) GROUP BY cube(name);
+
| name|grouping(name)|sum(age)|
+
| NULL| 1| 7|
|Alice| 0| 2|
| Bob| 0| 5|
+
SELECT name, grouping_id(), sum(age), avg(height) FROM VALUES (2, 'Alice', 165), (5, 'Bob', 180) people(age, name, height) GROUP BY cube(name, height);
+
| name|grouping_id()|sum(age)|avg(height)|
+
| NULL| 2| 2| 165.0|
|Alice| 0| 2| 165.0|
|Alice| 1| 2| 165.0|
| NULL| 3| 7| 172.5|
| Bob| 1| 5| 180.0|
| Bob| 0| 5| 180.0|
| NULL| 2| 5| 180.0|
+
SELECT histogram_numeric(col, 5) FROM VALUES (0), (1), (2), (10) AS tab(col);
+
|histogram_numeric(col, 5)|
+
| [{0, 1.0}, {1, 1....|
+
SELECT hll_sketch_estimate(hll_sketch_agg(col, 12)) FROM VALUES (1), (1), (2), (2), (3) tab(col);
+
|hll_sketch_estimate(hll_sketch_agg(col, 12))|
+
| 3|
+
SELECT hll_sketch_estimate(hll_union_agg(sketch, true)) FROM (SELECT hll_sketch_agg(col) as sketch FROM VALUES (1) tab(col) UNION ALL SELECT hll_sketch_agg(col, 20) as sketch FROM VALUES (1) tab(col));
+
|hll_sketch_estimate(hll_union_agg(sketch, true))|
+
| 1|
+
SELECT kurtosis(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col);
+
| kurtosis(col)|
+
|-0.7014368047529618|
+
SELECT kurtosis(col) FROM VALUES (1), (10), (100), (10), (1) as tab(col);
+
| kurtosis(col)|
+
|0.19432323191698986|
+
SELECT last(col) FROM VALUES (10), (5), (20) AS tab(col);
+
|last(col)|
+
| 20|
+
SELECT last(col) FROM VALUES (10), (5), (NULL) AS tab(col);
+
|last(col)|
+
| NULL|
+
SELECT last(col, true) FROM VALUES (10), (5), (NULL) AS tab(col);
+
|last(col)|
+
| 5|
+
SELECT last_value(col) FROM VALUES (10), (5), (20) AS tab(col);
+
|last_value(col)|
+
| 20|
+
SELECT last_value(col) FROM VALUES (10), (5), (NULL) AS tab(col);
+
|last_value(col)|
+
| NULL|
+
SELECT last_value(col, true) FROM VALUES (10), (5), (NULL) AS tab(col);
+
|last_value(col)|
+
| 5|
+
SELECT max(col) FROM VALUES (10), (50), (20) AS tab(col);
+
|max(col)|
+
| 50|
+
SELECT max_by(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
+
|max_by(x, y)|
+
| b|
+
SELECT mean(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|mean(col)|
+
| 2.0|
+
SELECT mean(col) FROM VALUES (1), (2), (NULL) AS tab(col);
+
|mean(col)|
+
| 1.5|
+
SELECT median(col) FROM VALUES (0), (10) AS tab(col);
+
|median(col)|
+
| 5.0|
+
SELECT median(col) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
+
| median(col)|
+
|INTERVAL '0-5' YE...|
+
SELECT min(col) FROM VALUES (10), (-1), (20) AS tab(col);
+
|min(col)|
+
| -1|
+
SELECT min_by(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
+
|min_by(x, y)|
+
| a|
+
SELECT mode(col) FROM VALUES (0), (10), (10) AS tab(col);
+
|mode(col)|
+
| 10|
+
SELECT mode(col) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH), (INTERVAL '10' MONTH) AS tab(col);
+
| mode(col)|
+
|INTERVAL '10' MONTH|
+
SELECT mode(col) FROM VALUES (0), (10), (10), (null), (null), (null) AS tab(col);
+
|mode(col)|
+
| 10|
+
SELECT percentile(col, 0.3) FROM VALUES (0), (10) AS tab(col);
+
|percentile(col, 0.3, 1)|
+
| 3.0|
+
SELECT percentile(col, array(0.25, 0.75)) FROM VALUES (0), (10) AS tab(col);
+
|percentile(col, array(0.25, 0.75), 1)|
+
| [2.5, 7.5]|
+
SELECT percentile(col, 0.5) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '10' MONTH) AS tab(col);
+
|percentile(col, 0.5, 1)|
+
| INTERVAL '0-5' YE...|
+
SELECT percentile(col, array(0.2, 0.5)) FROM VALUES (INTERVAL '0' SECOND), (INTERVAL '10' SECOND) AS tab(col);
+
|percentile(col, array(0.2, 0.5), 1)|
+
| [INTERVAL '0 00:0...'|
+
SELECT percentile_approx(col, array(0.5, 0.4, 0.1), 100) FROM VALUES (0), (1), (2), (10) AS tab(col);
+
|percentile_approx(col, array(0.5, 0.4, 0.1), 100)|
+
| [1, 1, 0]|
+
SELECT percentile_approx(col, 0.5, 100) FROM VALUES (0), (6), (7), (9), (10) AS tab(col);
+
|percentile_approx(col, 0.5, 100)|
+
| 7|
+
SELECT percentile_approx(col, 0.5, 100) FROM VALUES (INTERVAL '0' MONTH), (INTERVAL '1' MONTH), (INTERVAL '2' MONTH), (INTERVAL '10' MONTH) AS tab(col);
+
|percentile_approx(col, 0.5, 100)|
+
| INTERVAL '1' MONTH|
+
SELECT percentile_approx(col, array(0.5, 0.7), 100) FROM VALUES (INTERVAL '0' SECOND), (INTERVAL '1' SECOND), (INTERVAL '2' SECOND), (INTERVAL '10' SECOND) AS tab(col);
+
|percentile_approx(col, array(0.5, 0.7), 100)|
+
| [INTERVAL '01' SE...|
+
SELECT regr_avgx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
+
|regr_avgx(y, x)|
+
| 2.75|
+
SELECT regr_avgx(y, x) FROM VALUES (1, null) AS tab(y, x);
+
|regr_avgx(y, x)|
+
| NULL|
+
SELECT regr_avgx(y, x) FROM VALUES (null, 1) AS tab(y, x);
+
|regr_avgx(y, x)|
+
| NULL|
+
SELECT regr_avgx(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
+
|regr_avgx(y, x)|
+
| 3.0|
+
SELECT regr_avgx(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
+
|regr_avgx(y, x)|
+
| 3.0|
+
SELECT regr_avgy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
+
|regr_avgy(y, x)|
+
| 1.75|
+
SELECT regr_avgy(y, x) FROM VALUES (1, null) AS tab(y, x);
+
|regr_avgy(y, x)|
+
| NULL|
+
SELECT regr_avgy(y, x) FROM VALUES (null, 1) AS tab(y, x);
+
|regr_avgy(y, x)|
+
| NULL|
+
SELECT regr_avgy(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
+
| regr_avgy(y, x)|
+
|1.6666666666666667|
+
SELECT regr_avgy(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
+
|regr_avgy(y, x)|
+
| 1.5|
+
SELECT regr_count(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
+
|regr_count(y, x)|
+
| 4|
+
SELECT regr_count(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
+
|regr_count(y, x)|
+
| 3|
+
SELECT regr_count(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
+
|regr_count(y, x)|
+
| 2|
+
SELECT regr_intercept(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x);
+
|regr_intercept(y, x)|
+
| 0.0|
+
SELECT regr_intercept(y, x) FROM VALUES (1, null) AS tab(y, x);
+
|regr_intercept(y, x)|
+
| NULL|
+
SELECT regr_intercept(y, x) FROM VALUES (null, 1) AS tab(y, x);
+
|regr_intercept(y, x)|
+
| NULL|
+
SELECT regr_r2(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
+
| regr_r2(y, x)|
+
|0.2727272727272726|
+
SELECT regr_r2(y, x) FROM VALUES (1, null) AS tab(y, x);
+
|regr_r2(y, x)|
+
| NULL|
+
SELECT regr_r2(y, x) FROM VALUES (null, 1) AS tab(y, x);
+
|regr_r2(y, x)|
+
| NULL|
+
SELECT regr_r2(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
+
| regr_r2(y, x)|
+
|0.7500000000000001|
+
SELECT regr_r2(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
+
|regr_r2(y, x)|
+
| 1.0|
+
SELECT regr_slope(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x);
+
|regr_slope(y, x)|
+
| 1.0|
+
SELECT regr_slope(y, x) FROM VALUES (1, null) AS tab(y, x);
+
|regr_slope(y, x)|
+
| NULL|
+
SELECT regr_slope(y, x) FROM VALUES (null, 1) AS tab(y, x);
+
|regr_slope(y, x)|
+
| NULL|
+
SELECT regr_sxx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
+
| regr_sxx(y, x)|
+
|2.7499999999999996|
+
SELECT regr_sxx(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
+
|regr_sxx(y, x)|
+
| 2.0|
+
SELECT regr_sxx(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
+
|regr_sxx(y, x)|
+
| 2.0|
+
SELECT regr_sxy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
+
| regr_sxy(y, x)|
+
|0.7499999999999998|
+
SELECT regr_sxy(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
+
|regr_sxy(y, x)|
+
| 1.0|
+
SELECT regr_sxy(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
+
|regr_sxy(y, x)|
+
| 1.0|
+
SELECT regr_syy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
+
| regr_syy(y, x)|
+
|0.7499999999999999|
+
SELECT regr_syy(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
+
| regr_syy(y, x)|
+
|0.6666666666666666|
+
SELECT regr_syy(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
+
|regr_syy(y, x)|
+
| 0.5|
+
SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col);
+
| skewness(col)|
+
|1.1135657469022013|
+
SELECT skewness(col) FROM VALUES (-1000), (-100), (10), (20) AS tab(col);
+
| skewness(col)|
+
|-1.1135657469022011|
+
SELECT some(col) FROM VALUES (true), (false), (false) AS tab(col);
+
|some(col)|
+
| true|
+
SELECT some(col) FROM VALUES (NULL), (true), (false) AS tab(col);
+
|some(col)|
+
| true|
+
SELECT some(col) FROM VALUES (false), (false), (NULL) AS tab(col);
+
|some(col)|
+
| false|
+
SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|std(col)|
+
| 1.0|
+
SELECT stddev(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|stddev(col)|
+
| 1.0|
+
SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col);
+
| stddev_pop(col)|
+
|0.816496580927726|
+
SELECT stddev_samp(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|stddev_samp(col)|
+
| 1.0|
+
SELECT sum(col) FROM VALUES (5), (10), (15) AS tab(col);
+
|sum(col)|
+
| 30|
+
SELECT sum(col) FROM VALUES (NULL), (10), (15) AS tab(col);
+
|sum(col)|
+
| 25|
+
SELECT sum(col) FROM VALUES (NULL), (NULL) AS tab(col);
+
|sum(col)|
+
| NULL|
+
SELECT try_avg(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|try_avg(col)|
+
| 2.0|
+
SELECT try_avg(col) FROM VALUES (1), (2), (NULL) AS tab(col);
+
|try_avg(col)|
+
| 1.5|
+
SELECT try_avg(col) FROM VALUES (interval '2147483647 months'), (interval '1 months') AS tab(col);
+
|try_avg(col)|
+
| NULL|
+
SELECT try_sum(col) FROM VALUES (5), (10), (15) AS tab(col);
+
|try_sum(col)|
+
| 30|
+
SELECT try_sum(col) FROM VALUES (NULL), (10), (15) AS tab(col);
+
|try_sum(col)|
+
| 25|
+
SELECT try_sum(col) FROM VALUES (NULL), (NULL) AS tab(col);
+
|try_sum(col)|
+
| NULL|
+
SELECT try_sum(col) FROM VALUES (9223372036854775807L), (1L) AS tab(col);
+
|try_sum(col)|
+
| NULL|
+
SELECT var_pop(col) FROM VALUES (1), (2), (3) AS tab(col);
+
| var_pop(col)|
+
|0.6666666666666666|
+
SELECT var_samp(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|var_samp(col)|
+
| 1.0|
+
SELECT variance(col) FROM VALUES (1), (2), (3) AS tab(col);
+
|variance(col)|
+
| 1.0|
+