Question: when running a jar you compiled yourself, do you first have to copy it to the same path on every node (whether via hdfs or scp)?
A few reminders:
1. To head off Chinese-encoding problems, configure UTF-8 as the default character set first.
2. The JDBC driver jar can either be placed on every node or put under /lib on hdfs.
3. If needed, grant mysql access to user brian (password 123456) from the relevant node IPs:
mysql> grant all on *.* to 'brian'@'172.19.32.108' identified by '123456';
-------------------------
This post reproduces the example from the referenced 《hadoop开发者》 (Hadoop Developer) article on a two-node hadoop cluster.
On the .12 machine (172.19.102.12):
1. Configure UTF-8 as the default encoding
In my.cnf, under [mysqld] add
character_set_server=utf8 (for newer mysql versions)
and under [mysql] add
default-character-set=utf8
then restart the mysql service.
You can now verify that, after logging in to mysql, Chinese characters display correctly without first running set names utf8; (show variables like 'character_set%'; will confirm the settings).
2. Create the table in the tmp database:
use tmp;
set names utf8; -- optional, but plays it safe
DROP TABLE IF EXISTS `tmp`.`teacher`;
CREATE TABLE `tmp`.`teacher` (
`id` int(11) default NULL,
`name` char(20) default NULL,
`age` int(11) default NULL,
`departmentID` int(11) default NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
insert into teacher(id, name, age, departmentID)
values('101', 'tom', '29', '10');
insert into teacher(id, name, age, departmentID)
values('102', 'tony', '39', '20');
insert into teacher(id, name, age, departmentID)
values('103', 'lily', '26', '20');
insert into teacher(id, name, age, departmentID)
values('104', '张三', '25', '30');
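Before bringing hadoop into the picture, it is worth checking that the grant and the UTF-8 settings actually work from another node. A minimal standalone JDBC check along the lines of the sketch below will do; the host, database, and credentials are the ones used above, and the useUnicode/characterEncoding URL parameters are an optional extra I add to force UTF-8 on the connection:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

// Minimal JDBC smoke test: run it on another node to verify that the grant
// for 'brian' works and that Chinese characters survive the round trip.
public class JdbcCheck {
    public static void main(String[] args) throws Exception {
        Class.forName("com.mysql.jdbc.Driver");
        // useUnicode/characterEncoding are optional; they force UTF-8 on the connection
        Connection conn = DriverManager.getConnection(
                "jdbc:mysql://172.19.102.12/tmp?useUnicode=true&characterEncoding=UTF-8",
                "brian", "123456");
        Statement st = conn.createStatement();
        ResultSet rs = st.executeQuery("SELECT id, name, age, departmentID FROM teacher");
        while (rs.next()) {
            System.out.println(rs.getInt(1) + "\t" + rs.getString(2)
                    + "," + rs.getInt(3) + "," + rs.getInt(4));
        }
        rs.close();
        st.close();
        conn.close();
    }
}

Compile and run it with the connector jar on the classpath (java -cp .:mysql-connector-java-5.1.18-bin.jar JdbcCheck); the 张三 row should print intact.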
3. Create the mytest directories under ~/hadoop-1.0.3:
mkdir mytest
mkdir mytest/src
mkdir mytest/classes
4. Write the source:
nano mytest/src/DBAccess2.java
The code:
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.lib.IdentityReducer; //IdentityReducer.class
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.*;
import org.apache.hadoop.io.*;
import java.io.DataInput;
import java.io.DataOutput;
import java.sql.*; //ResultSet, PreparedStatement
import com.mysql.jdbc.Driver;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//import java.io.BufferedReader;
//import java.io.InputStreamReader;
//import java.util.ArrayList;
//import java.util.Arrays;

public class DBAccess2 extends Configured implements Tool {

    // Both Writable (Hadoop serialization) and DBWritable (JDBC row mapping).
    public static class TeacherRecord implements Writable, DBWritable { //add "static" ??
        int id;
        String name;
        int age;
        int departmentID;

        public TeacherRecord() { //add by brian
            System.out.println("TeacherRecord()");
        }

        public TeacherRecord(TeacherRecord t) { //add by brian
            System.out.println("TeacherRecord(TeacherRecord t)");
            this.id = t.id;
            this.name = t.name;
            this.age = t.age;
            this.departmentID = t.departmentID;
        }

        public void readFields(DataInput in) throws IOException {
            this.id = in.readInt();
            this.name = Text.readString(in);
            this.age = in.readInt();
            this.departmentID = in.readInt();
        }

        public void write(DataOutput out) throws IOException {
            out.writeInt(this.id);
            Text.writeString(out, this.name);
            out.writeInt(this.age);
            out.writeInt(this.departmentID);
        }

        public void readFields(ResultSet result) throws SQLException {
            this.id = result.getInt(1);
            this.name = result.getString(2);
            this.age = result.getInt(3);
            this.departmentID = result.getInt(4);
        }

        public void write(PreparedStatement stmt) throws SQLException {
            stmt.setInt(1, this.id);
            stmt.setString(2, this.name);
            stmt.setInt(3, this.age);
            stmt.setInt(4, this.departmentID);
        }

        public String toString() {
            return new String(this.name + "," + this.age + "," + this.departmentID);
        }
    }

    public static class DBAccessMapper extends MapReduceBase
            implements Mapper<LongWritable, TeacherRecord, LongWritable, Text> {

        // not used; leftovers from the example this code started from
        private final static IntWritable uno = new IntWritable(1);
        private IntWritable citationCount = new IntWritable();

        public void map(LongWritable key, TeacherRecord value,
                OutputCollector<LongWritable, Text> collector, Reporter reporter)
                throws IOException {
            collector.collect(new LongWritable(value.id), new Text(value.toString()));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        JobConf job = new JobConf(conf, DBAccess2.class);
        job.setInputFormat(DBInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("dboutput"));
        DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
                "jdbc:mysql://172.19.102.12/tmp", "brian", "123456");
        String[] fields = {"id", "name", "age", "departmentID"};
        DBInputFormat.setInput(job, TeacherRecord.class, "teacher", null, "id", fields);
        job.setMapperClass(DBAccessMapper.class);
        job.setReducerClass(IdentityReducer.class);
        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new DBAccess2(), args);
        System.exit(res);
    }
}
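A side note on TeacherRecord: only readFields(ResultSet) is exercised by this job, since DBInputFormat does the reading. The write(PreparedStatement) half of DBWritable would only come into play if the job also wrote results back to mysql through DBOutputFormat. That is not done in this post, but a rough sketch of what that configuration could look like (the teacher_out table name is hypothetical):

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBOutputFormat;

// Hypothetical sketch, not used in this post: configuring a job to write
// TeacherRecord rows into a teacher_out table via DBOutputFormat, which
// fills the INSERT's placeholders by calling write(PreparedStatement).
public class DbOutputSketch {
    public static void configureOutput(JobConf job) {
        DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
                "jdbc:mysql://172.19.102.12/tmp", "brian", "123456");
        job.setOutputFormat(DBOutputFormat.class);
        // generates: INSERT INTO teacher_out (id, name, age, departmentID) VALUES (?, ?, ?, ?)
        DBOutputFormat.setOutput(job, "teacher_out", "id", "name", "age", "departmentID");
        // the reduce output key must then be the DBWritable (TeacherRecord); the value is ignored
    }
}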
5. Put mysql-connector-java-5.1.18-bin.jar into the hadoop-1.0.3/lib directory.
6. Compile and run:
javac -classpath hadoop-core-1.0.3.jar:lib/commons-cli-1.2.jar:lib/mysql-connector-java-5.1.18-bin.jar -d mytest/classes/ mytest/src/DBAccess2.java
jar -cvf mytest/DBAccess2.jar -C mytest/classes/ .
bin/hadoop fs -rmr dboutput   # the output path "dboutput" is hard-coded in the source
time bin/hadoop jar mytest/DBAccess2.jar DBAccess2
But at runtime the following error appears:
huangshaobin@backtest12:~/hadoop-1.0.3$ time bin/hadoop jar mytest/DBAccess2.jar DBAccess2
12/09/21 17:20:25 INFO mapred.JobClient: Running job: job_201209201824_0008
12/09/21 17:20:26 INFO mapred.JobClient:  map 0% reduce 0%
12/09/21 17:20:38 INFO mapred.JobClient: Task Id : attempt_201209201824_0008_m_000000_0, Status : FAILED
java.lang.RuntimeException: Error in configuring object
	at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:93)
	at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:64)
	at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:117)
	at org.apache.hadoop.mapred.JobConf.getInputFormat(JobConf.java:575)
	at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:197)
	at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:418)
	at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
	at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:416)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
	at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.reflect.InvocationTargetException
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:616)
	at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:88)
	... 11 more
Caused by: java.lang.RuntimeException: java.lang.ClassNotFoundException: com.mysql.jdbc.Driver
	at org.apache.hadoop.mapred.lib.db.DBInputFormat.configure(DBInputFormat.java:271)
	... 16 more
Caused by: java.lang.ClassNotFoundException: com.mysql.jdbc.Driver
	at java.net.URLClassLoader$1.run(URLClassLoader.java:217)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.net.URLClassLoader.findClass(URLClassLoader.java:205)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:321)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:294)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:266)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:186)
	at org.apache.hadoop.mapred.lib.db.DBConfiguration.getConnection(DBConfiguration.java:123)
	at org.apache.hadoop.mapred.lib.db.DBInputFormat.configure(DBInputFormat.java:266)
	... 16 more
12/09/21 17:20:41 INFO mapred.JobClient: Task Id : attempt_201209201824_0008_m_000001_0, Status : FAILED
java.lang.RuntimeException: Error in configuring object
	at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:93)
	at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:64)
	at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:117)
	at org.apache.hadoop.mapred.JobConf.getInputFormat(JobConf.java:575)
	at org.apache.hadoop.mapred.MapTask$TrackedRecordReader.<init>(MapTask.java:197)
	at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:418)
	at org.apache.hadoop.mapred.MapTask.run(MapTask.java:372)
	at org.apache.hadoop.mapred.Child$4.run(Child.java:255)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:396)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1121)
	at org.apache.hadoop.mapred.Child.main(Child.java:249)
Caused by: java.lang.reflect.InvocationTargetException
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
	at java.lang.reflect.Method.invoke(Method.java:597)
	at org.apache.hadoop.util.ReflectionUtils.setJobConf(ReflectionUtils.java:88)
	... 11 more
Caused by: java.lang.RuntimeException: java.lang.ClassNotFoundException: com.mysql.jdbc.Driver
	at org.apache.hadoop.mapred.lib.db.DBInputFormat.configure(DBInputFormat.java:271)
	... 16 more
Caused by: java.lang.ClassNotFoundException: com.mysql.jdbc.Driver
	at java.net.URLClassLoader$1.run(URLClassLoader.java:202)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.net.URLClassLoader.findClass(URLClassLoader.java:190)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:307)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:301)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:248)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:169)
	at org.apache.hadoop.mapred.lib.db.DBConfiguration.getConnection(DBConfiguration.java:123)
	at org.apache.hadoop.mapred.lib.db.DBInputFormat.configure(DBInputFormat.java:266)
	... 16 more
7. Copy the jdbc driver to the other node. The traces show the ClassNotFoundException is thrown inside the map task's own JVM, and map tasks can be scheduled on any node, so every node needs the driver:
huangshaobin@backtest12:~/hadoop-1.0.3$ scp lib/mysql-connector-java-5.1.18-bin.jar backtest11:/home/huangshaobin/hadoop-1.0.3/lib
mysql-connector-java-5.1.18-bin.jar 100% 771KB 771.4KB/s 00:00
Run again:
bin/hadoop fs -rmr dboutput
time bin/hadoop jar mytest/DBAccess2.jar DBAccess2
The same error still appears, even though it works fine in pseudo-distributed mode??? (Most likely because jars dropped into lib/ are only picked up onto the TaskTracker's classpath when the daemon starts, so copying the jar there has no effect until the cluster is restarted.)
Copy the job jar to the other node as well:
huangshaobin@backtest12:~/hadoop-1.0.3$ scp -r mytest/ backtest11:/home/huangshaobin/hadoop-1.0.3
Run again:
bin/hadoop fs -rmr dboutput
time bin/hadoop jar mytest/DBAccess2.jar DBAccess2
Still the same error! Could the first failure be down to a Java version difference? But either way the jdbc driver still cannot be found!
Next, delete the jdbc driver under lib/ on .11, and delete mytest/DBAccess2.jar there too. Then, right after JobConf job = new JobConf(conf, DBAccess2.class); in the code, add these two lines:
job.set("mapred.job.tracker", "172.19.102.12:9001");//
DistributedCache.addFileToClassPath(new Path("/lib/mysql-connector-java-5.1.18-bin.jar"), job); //
and add import org.apache.hadoop.filecache.DistributedCache; at the top. That Path is resolved against the default filesystem, so copy the jdbc driver onto hdfs:
bin/hadoop fs -put lib/mysql-connector-java-5.1.18-bin.jar /lib/mysql-connector-java-5.1.18-bin.jar
Recompile:
huangshaobin@backtest12:~/hadoop-1.0.3$ javac -classpath hadoop-core-1.0.3.jar:lib/commons-cli-1.2.jar:lib/mysql-connector-java-5.1.18-bin.jar -d mytest/classes/ mytest/src/DBAccess2.java
Note: mytest/src/DBAccess2.java uses or overrides a deprecated API.
Note: Recompile with -Xlint:deprecation for details.
What do these two notes mean? javac is only warning that the code calls something marked deprecated; recompiling with -Xlint:deprecation would list the exact calls. Harmless, so ignore them for now.
Run again, and this time it succeeds!!
So these two lines are the key:
job.set("mapred.job.tracker", "172.19.102.12:9001");//
DistributedCache.addFileToClassPath(new Path("/lib/mysql-connector-java-5.1.18-bin.jar"), job); //
But DBAccess2.jar itself does not need to be copied to the other nodes: the job jar named in new JobConf(conf, DBAccess2.class) is uploaded by the JobClient at submit time and shipped to each task node automatically, which also answers the question at the top of this post.
The imports at the top of the file were messy as well; after tidying those up too, the final revised code is:
import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.lib.db.DBWritable;
import org.apache.hadoop.mapred.lib.db.DBInputFormat;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;

import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;

public class DBAccess2 extends Configured implements Tool {

    // Both Writable (Hadoop serialization) and DBWritable (JDBC row mapping).
    public static class TeacherRecord implements Writable, DBWritable { //add "static" ??
        int id;
        String name;
        int age;
        int departmentID;

        public TeacherRecord() { //add by brian
            System.out.println("TeacherRecord()");
        }

        public TeacherRecord(TeacherRecord t) { //add by brian
            System.out.println("TeacherRecord(TeacherRecord t)");
            this.id = t.id;
            this.name = t.name;
            this.age = t.age;
            this.departmentID = t.departmentID;
        }

        public void readFields(DataInput in) throws IOException {
            this.id = in.readInt();
            this.name = Text.readString(in);
            this.age = in.readInt();
            this.departmentID = in.readInt();
        }

        public void write(DataOutput out) throws IOException {
            out.writeInt(this.id);
            Text.writeString(out, this.name);
            out.writeInt(this.age);
            out.writeInt(this.departmentID);
        }

        public void readFields(ResultSet result) throws SQLException {
            this.id = result.getInt(1);
            this.name = result.getString(2);
            this.age = result.getInt(3);
            this.departmentID = result.getInt(4);
        }

        public void write(PreparedStatement stmt) throws SQLException {
            stmt.setInt(1, this.id);
            stmt.setString(2, this.name);
            stmt.setInt(3, this.age);
            stmt.setInt(4, this.departmentID);
        }

        public String toString() {
            return new String(this.name + "," + this.age + "," + this.departmentID);
        }
    }

    public static class DBAccessMapper extends MapReduceBase
            implements Mapper<LongWritable, TeacherRecord, LongWritable, Text> {

        // not used; leftovers from the example this code started from
        private final static IntWritable uno = new IntWritable(1);
        private IntWritable citationCount = new IntWritable();

        public void map(LongWritable key, TeacherRecord value,
                OutputCollector<LongWritable, Text> collector, Reporter reporter)
                throws IOException {
            collector.collect(new LongWritable(value.id), new Text(value.toString()));
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        JobConf job = new JobConf(conf, DBAccess2.class);
        job.set("mapred.job.tracker", "172.19.102.12:9001");//
        DistributedCache.addFileToClassPath(new Path("/lib/mysql-connector-java-5.1.18-bin.jar"), job); //
        job.setInputFormat(DBInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("dboutput"));
        DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
                "jdbc:mysql://172.19.102.12/tmp", "brian", "123456");

        String[] fields = {"id", "name", "age", "departmentID"};
        DBInputFormat.setInput(job, TeacherRecord.class, "teacher", null, "id", fields);

        job.setMapperClass(DBAccessMapper.class);
        job.setReducerClass(IdentityReducer.class);
        JobClient.runJob(job);

        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new DBAccess2(), args);
        System.exit(res);
    }
}
The output:
huangshaobin@backtest12:~/hadoop-1.0.3$ bin/hadoop fs -cat dboutput/*
101 tom,29,10
102 tony,39,20
103 lily,26,20
104 张三,25,30
Summary of the steps:
1. In the code, point the job at the jobtracker and add the extra jdbc jar to the task classpath:
job.set("mapred.job.tracker", "172.19.102.12:9001");
DistributedCache.addFileToClassPath(new Path("/lib/mysql-connector-java-5.1.18-bin.jar"), job);
2. Compile.
3. Upload the extra jdbc jar to /lib on hdfs:
bin/hadoop fs -put lib/mysql-connector-java-5.1.18-bin.jar /lib/mysql-connector-java-5.1.18-bin.jar
4. Run; it succeeds.
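One alternative I did not test here: since DBAccess2 runs through ToolRunner, hadoop's generic -libjars option should also ship the driver jar to the task nodes, without hard-coding an hdfs path in run(), along the lines of:
time bin/hadoop jar mytest/DBAccess2.jar DBAccess2 -libjars lib/mysql-connector-java-5.1.18-bin.jar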
Also, see class DistributedCache in the r1.0.3 API docs at http://hadoop.apache.org/docs/r1.0.3/api/index.html to learn its other methods; the javadoc there includes usage examples.
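For instance, a common pattern beyond addFileToClassPath is shipping an ordinary data file to every task node and reading the local copy inside the mapper. A minimal sketch, assuming a file has already been uploaded to hdfs at the hypothetical path /cache/dict.txt:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

// Illustrative sketch only, not part of this post's job: ship an arbitrary
// hdfs file to every task node and open the local copy in the mapper.
public class CacheSketch {
    // At job-setup time (runs on the client): register the hdfs file.
    public static void addDict(JobConf job) {
        DistributedCache.addCacheFile(URI.create("/cache/dict.txt"), job);
    }

    // In the task: by the time configure() runs, the node has a local copy.
    public static class MyMapper extends MapReduceBase {
        public void configure(JobConf job) {
            try {
                Path[] local = DistributedCache.getLocalCacheFiles(job);
                BufferedReader in = new BufferedReader(new FileReader(local[0].toString()));
                System.out.println("first line of cached file: " + in.readLine());
                in.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}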
Reference:
《hadoop开发者》 issue 2: 访问数据库 ("Accessing Databases")