package fang.qiang;

import org.apache.hadoop.hive.ql.exec.UDF;

public class helloUDF extends UDF {

    /**
     * Hive calls evaluate() once per row; the return value becomes the
     * column value. Note that "HelloWorld " + null simply yields the
     * string "HelloWorld null", so the catch block is purely defensive.
     */
    public String evaluate(String str) {
        try {
            return "HelloWorld " + str;
        } catch (Exception e) {
            return null;
        }
    }
}

Compile this Java file into helloudf.jar, then register and use the function in Hive:

hive> add jar helloudf.jar;
hive> create temporary function helloworld as 'fang.qiang.helloUDF';
hive> select helloworld(t.col1) from t limit 10;
hive> drop temporary function helloworld;
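Because evaluate() is an ordinary Java method, the UDF can be exercised without a Hive session at all. The harness below is a minimal sketch of my own (the HelloUDFLocalTest class and its main method are illustrative assumptions, not part of the tutorial's code or the Hive API):

// Hypothetical local check of the UDF contract: call evaluate() directly.
public class HelloUDFLocalTest {
    public static void main(String[] args) {
        fang.qiang.helloUDF udf = new fang.qiang.helloUDF();
        System.out.println(udf.evaluate("hive")); // prints: HelloWorld hive
        System.out.println(udf.evaluate(null));   // prints: HelloWorld null
    }
}

This kind of check is a quick way to confirm the string logic before packaging the jar and registering the function.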
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;

/**
 * Computes an average.
 */
public final class UDAFExampleAvg extends UDAF {

    /**
     * A class that holds the intermediate aggregation state.
     */
    public static class UDAFAvgState {
        private long mCount;
        private double mSum;
    }

    /**
     * The class that actually implements the aggregation. Hive automatically
     * looks inside the UDAF class for inner classes that implement
     * UDAFEvaluator.
     */
    public static class UDAFExampleAvgEvaluator implements UDAFEvaluator {

        UDAFAvgState state;

        public UDAFExampleAvgEvaluator() {
            super();
            state = new UDAFAvgState();
            init();
        }

        /**
         * Reset the state of the aggregation.
         */
        public void init() {
            state.mSum = 0;
            state.mCount = 0;
        }

        /**
         * Iterate over one row of the aggregation group. The number and types
         * of the parameters must match those supplied in the HiveQL call.
         *
         * This function should always return true.
         */
        public boolean iterate(Integer o) {
            if (o != null) {
                state.mSum += o;
                state.mCount++;
            }
            return true;
        }

        /**
         * Because HiveQL is ultimately compiled into MapReduce jobs, a single
         * aggregation group may be spread across several mappers; this
         * function returns the partial result of one mapper.
         */
        public UDAFAvgState terminatePartial() {
            // This is SQL standard - average of zero items should be null.
            return state.mCount == 0 ? null : state;
        }

        /**
         * This corresponds to the reduce phase: it merges the partial results
         * produced by the mappers.
         *
         * The function takes exactly one parameter, whose type must match the
         * return type of terminatePartial().
         */
        public boolean merge(UDAFAvgState o) {
            if (o != null) {
                state.mSum += o.mSum;
                state.mCount += o.mCount;
            }
            return true;
        }

        /**
         * Return the final result.
         */
        public Double terminate() {
            // This is SQL standard - average of zero items should be null.
            return state.mCount == 0 ? null : Double.valueOf(state.mSum / state.mCount);
        }
    }

    private UDAFExampleAvg() {
        // prevent instantiation
    }
}

Compile this Java file into avg.jar, then register and use the function in Hive:

hive> add jar avg.jar;
hive> create temporary function avg_example as 'UDAFExampleAvg';
hive> select avg_example(t.col1) from t group by id;
hive> drop temporary function avg_example;
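To see how the four phases fit together, the evaluator can also be driven by hand outside of Hive. The driver below is a hedged sketch (the AvgEvaluatorLocalTest class is a hypothetical harness of mine; only UDAFExampleAvg itself comes from the tutorial): it simulates two mappers each feeding rows through iterate(), then a reducer merging the partial states and producing the final average.

// Hypothetical driver that simulates Hive's call sequence:
// init -> iterate -> terminatePartial (per mapper), then merge -> terminate.
public class AvgEvaluatorLocalTest {
    public static void main(String[] args) {
        // "Map" phase: each evaluator sees part of the aggregation group.
        UDAFExampleAvg.UDAFExampleAvgEvaluator map1 =
                new UDAFExampleAvg.UDAFExampleAvgEvaluator();
        map1.iterate(1);
        map1.iterate(2);
        UDAFExampleAvg.UDAFExampleAvgEvaluator map2 =
                new UDAFExampleAvg.UDAFExampleAvgEvaluator();
        map2.iterate(3);

        // "Reduce" phase: merge the partial states, then finish.
        UDAFExampleAvg.UDAFExampleAvgEvaluator reducer =
                new UDAFExampleAvg.UDAFExampleAvgEvaluator();
        reducer.merge(map1.terminatePartial());
        reducer.merge(map2.terminatePartial());
        System.out.println(reducer.terminate()); // prints: 2.0
    }
}

Walking through the lifecycle this way makes the contract concrete: merge() must accept exactly what terminatePartial() returns, which is why the two signatures are tied together.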