Operating HBase with pyspark and happybase requires that the pyspark and happybase Python packages be deployed and installed in advance; the installation steps are covered in earlier chapters and are not repeated here.
1. Import the required packages
from pyspark import SparkContext, SparkConf  # pyspark package, v2.2.0
import happybase  # happybase package, v1.2.0
import numpy as np
2. Create an HBase helper class
class Happybase_ope:
    # Initialize the happybase connection; by default it connects to the
    # local HBase, and table_prefix acts as the namespace (workspace)
    def __init__(self, host='localhost', port=9090, timeout=None, table_prefix=None):
        self.connection = happybase.Connection(host=host, port=port, timeout=timeout, table_prefix=table_prefix)

    def createTable(self, tablename, families):
        self.connection.create_table(tablename, families)
        # families format:
        # families = {
        #     'cf1': dict(max_versions=10),
        #     'cf2': dict(max_versions=1, block_cache_enabled=False),
        #     'cf3': dict(),  # use defaults
        # }

    def table(self, tablename):
        return self.connection.table(tablename)

    def close(self):
        self.connection.close()
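happybase talks to HBase through the HBase Thrift gateway, so the Thrift server must be running on the target host (port 9090 by default, matching the constructor above). As a quick smoke test of the class, the existing tables can be listed; this is a sketch, and note that happybase returns table names as bytes under Python 3:

hpbase = Happybase_ope()  # connects to localhost:9090
print(hpbase.connection.tables())  # e.g. [b'tb', b'tb2']
hpbase.close()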
3. Create a table with compression enabled
hpbase = Happybase_ope()
tbname = 'tb'
# create the table
families = {
    'cf1': dict(max_versions=366, compression='SNAPPY')
}
hpbase.createTable(tbname, families)
The compression codec is Snappy, and up to 366 versions of each cell are kept.
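create_table raises an error if the table already exists, so when the script may be re-run it can help to guard the call. A minimal defensive sketch, reusing the helper above:

# skip creation if the table already exists; tables() returns names as bytes
if tbname.encode() not in hpbase.connection.tables():
    hpbase.createTable(tbname, families)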
4. Write a single record with put
hpbase = Happybase_ope()
tbname = 'tb2'
table = hpbase.table(tbname)
data = {"cf1:name": 'zhangsan', "cf1:age": "18"}
table.put(row='20180113', data=data)  # the current timestamp is used by default
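To confirm the write, the record can be read back with table.row(); this is a quick sanity check, and under Python 3 happybase returns column names and values as bytes:

row = table.row('20180113')
print(row)  # expected: {b'cf1:name': b'zhangsan', b'cf1:age': b'18'}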
5. Write data in batches
Batch writes use happybase's batch method. The code is as follows:
# num is a (row_array, row_index) pair produced by zipWithIndex below
def WriteArr(num):
    hpbase = Happybase_ope()
    tbname = 'tb2'
    table = hpbase.table(tbname)
    bat = table.batch()
    # the 2-D array's row and column indices form the row key
    for i in range(0, len(num[0])):
        bat.put('{0}-{1}'.format(num[1], i), {'cf1:value': num[0][i]})
    bat.send()
    hpbase.close()
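As written, batch() buffers every put in memory and only flushes on send(). happybase's batch also accepts a batch_size argument that flushes automatically every N mutations, and it works as a context manager so the final send() is implicit. A sketch of that variant (the batch_size value is illustrative):

def WriteArrBatched(num):
    hpbase = Happybase_ope()
    table = hpbase.table('tb2')
    # flush to HBase every 1000 puts; any remainder is sent on exit
    with table.batch(batch_size=1000) as bat:
        for i in range(0, len(num[0])):
            bat.put('{0}-{1}'.format(num[1], i), {'cf1:value': num[0][i]})
    hpbase.close()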
Next, use numpy to create a 2000x5000 two-dimensional array, then use pyspark to convert the array into an RDD. The code is as follows:
if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("hpbase")
    # conf.setMaster("local[*]")  # uncomment to run locally
    sc = SparkContext(conf=conf)
    lenx = 2000
    leny = 5000
    # every element defaults to 1 and must be stored as a string
    nums = np.ones((lenx, leny), dtype=np.str)
    rddnums = sc.parallelize(nums)  # create the RDD
    # zipWithIndex attaches an index to each element (each element is one
    # row of the array); foreach writes each indexed row to HBase
    rddnums.zipWithIndex().repartition(100).foreach(lambda zipArr: WriteArr(zipArr))
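After the job finishes, a prefix scan can verify that the rows landed. A sketch that prints the first few cells of array row 0 (row keys follow the 'rowIndex-colIndex' pattern used in WriteArr; the prefix is passed as bytes for Python 3 compatibility):

hpbase = Happybase_ope()
table = hpbase.table('tb2')
for key, data in table.scan(row_prefix=b'0-', limit=5):
    print(key, data)  # e.g. b'0-0' {b'cf1:value': b'1'}
hpbase.close()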
The complete code is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pyspark import SparkContext, SparkConf
import happybase
import numpy as np

class Happybase_ope:
    def __init__(self, host='localhost', port=9090, timeout=None, table_prefix=None):
        self.connection = happybase.Connection(host=host, port=port, timeout=timeout, table_prefix=table_prefix)

    def createTable(self, tablename, families):
        self.connection.create_table(tablename, families)

    def table(self, tablename):
        return self.connection.table(tablename)

    def close(self):
        self.connection.close()

def WriteArr(num):
    hpbase = Happybase_ope()
    tbname = 'tb2'
    table = hpbase.table(tbname)
    bat = table.batch()
    for i in range(0, len(num[0])):
        bat.put('{0}-{1}'.format(num[1], i), {'cf1:value': num[0][i]})
    bat.send()
    hpbase.close()

if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("hpbase")
    # conf.setMaster("local[*]")  # uncomment to run locally
    sc = SparkContext(conf=conf)
    lenx = 2000
    leny = 5000
    nums = np.ones((lenx, leny), dtype=np.str)
    rddnums = sc.parallelize(nums)
    rddnums.zipWithIndex().repartition(100).foreach(lambda zipArr: WriteArr(zipArr))
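To run the complete script on a cluster, it would typically be launched with spark-submit; the file name and master below are illustrative, and the happybase package must be installed on every executor node since WriteArr runs inside the executors:

spark-submit --master yarn write_hbase.py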