Bootstrap

13. MapReduce自定义OutputFormat

一. OutputFormat简介

OutputFormat是MapReduce输出的基类,所有MapReduce输出都实现了OutputFormat接口,它接收ReduceTask产生的数据,然后将结果按照指定格式输出。

在MapReduce中,如果不指定,默认使用的是TextOutputFormat。但是在一些特定的场景下,默认的TextOutputFormat不一定能满足我们的需求,因此可以自定义OutputFormat来实现个性化需求。

二. 需求

使用MapReduce对输入文件中的单词进行计数,单词"hello"的计数结果输出到hello.log中,非"hello"的单词的计数结果输出到non-hello.log。

要实现上面的输出需求,就需要自定义OutputFormat。

自定义OutputFormat的步骤:

  1. 自定义一个类继承FileOutputFormat。
  2. 自定义一个类继承RecordWriter,重写方法write()和close()。

代码实现

package mr;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


class MultiOuputFormat extends FileOutputFormat<Text, IntWritable> {

    @Override
    public RecordWriter<Text, IntWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        Configuration configuration = job.getConfiguration();
        String outputPath = configuration.get(FileOutputFormat.OUTDIR);
        FileSystem fs = FileSystem.get(configuration);

        Path path1 = new Path(outputPath + "/hello.log");
        Path path2 = new Path(outputPath + "/non-hello.log");
        if (fs.exists(path1)) {
            fs.delete(path1, true);
        }
        if (fs.exists(path2)) {
            fs.delete(path2, true);
        }
        FSDataOutputStream out1 = fs.create(path1);
        FSDataOutputStream out2 = fs.create(path2);

        return new MyRecordWriter(out1, out2);
    }
}


class MyRecordWriter extends RecordWriter<Text, IntWritable> {
    private FSDataOutputStream out1;
    private FSDataOutputStream out2;

    public MyRecordWriter(FSDataOutputStream out1, FSDataOutputStream out2) {
        super();
        this.out1 = out1;
        this.out2 = out2;
    }

    @Override
    public void write(Text key, IntWritable value) throws IOException, InterruptedException {
        String outStr = key.toString() + "," + value.toString() + "\n";
        if (key.toString().contains("hello")) {
            out1.write(outStr.getBytes());
        } else {
            out2.write(outStr.getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        IOUtils.close(out1);
        IOUtils.close(out2);
    }
}





public class WordCountOutputFormat {

    static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String word: words) {
                context.write(new Text(word), new IntWritable(1));
            }
        }

    }

    static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            context.write(key, new IntWritable(sum));
        }

    }


    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountOutputFormat.class);
        job.setJobName("WordCount");

        // 设置输入,输出路径
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 设置Mapper
        job.setMapperClass(WordCountOutputFormat.WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 设置Reducer
        job.setReducerClass(WordCountOutputFormat.WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(1);
        job.setOutputFormatClass(MultiOuputFormat.class);


        boolean waitFor = job.waitForCompletion(true);
        System.exit(waitFor ? 0 : 1);
    }
}

运行结果

[root@hadoop1 ~]# yarn jar learn-1.0-SNAPSHOT.jar  mr.WordCountOutputFormat  /test/a.txt  /output

# 查看输入文件
[root@hadoop1 ~]# hdfs dfs -text /test/a.txt
hello world
name hello
world

# 查看结果文件
[root@hadoop1 ~]# hdfs dfs -ls /output
Found 3 items
-rw-r--r--   3 root supergroup          0 2024-10-29 21:52 /output/_SUCCESS
-rw-r--r--   3 root supergroup          8 2024-10-29 21:52 /output/hello.log
-rw-r--r--   3 root supergroup         15 2024-10-29 21:52 /output/non-hello.log
[root@hadoop1 ~]# hdfs dfs -text /output/hello.log
hello,2
[root@hadoop1 ~]# hdfs dfs -text /output/non-hello.log
name,1
world,2
;