PySpark in Action (6): Batch Writing to HBase with pyspark and happybase

Operating HBase with pyspark and happybase requires that the pyspark and happybase Python packages are deployed and installed beforehand; the installation steps were covered in earlier chapters and are not repeated here.

1. Import the required packages

from pyspark import SparkContext, SparkConf  # pyspark package, v2.2.0
import happybase                             # happybase package, v1.2.0
import numpy as np

2. Create an HBase helper class

class Happybase_ope:
    # Initialize the happybase connection; defaults to the local HBase Thrift server.
    # table_prefix acts as the namespace (workspace) prefix.
    def __init__(self, host='localhost', port=9090, timeout=None, table_prefix=None):
        self.connection = happybase.Connection(host=host, port=port, timeout=timeout, table_prefix=table_prefix)

    def createTable(self, tablename, families):
        self.connection.create_table(tablename, families)

    # families format:
    # families = {
    #     'cf1': dict(max_versions=10),
    #     'cf2': dict(max_versions=1, block_cache_enabled=False),
    #     'cf3': dict(),  # use defaults
    # }

    def table(self, tablename):
        return self.connection.table(tablename)

    def close(self):
        self.connection.close()
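
The class goes through the HBase Thrift service, so the Thrift server must be started on the HBase side (port 9090 by default). A minimal usage sketch; the host name 'hbase-host' is only a placeholder:

    # connect to a remote Thrift server and list the existing tables
    hpbase = Happybase_ope(host='hbase-host', port=9090)
    print(hpbase.connection.tables())  # list of table names
    hpbase.close()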

3. Create a table with compression enabled

    hpbase = Happybase_ope()
    tbname = 'tb'

    # create the table
    families = {
        'cf1': dict(max_versions=366, compression='SNAPPY')
    }
    hpbase.createTable(tbname, families)

The compression codec is SNAPPY and the maximum number of versions is 366.
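
Note that the SNAPPY codec must be available on the HBase region servers, otherwise table creation fails. The column family settings can be checked afterwards through happybase's Table.families(); a minimal sketch reusing hpbase and tbname from above:

    # inspect the column family descriptors of the new table
    table = hpbase.table(tbname)
    print(table.families())  # should report cf1 with SNAPPY compression and 366 max versions
    hpbase.close()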

4. Write a single record with put

    hpbase = Happybase_ope()
    tbname = 'tb2'

    table = hpbase.table(tbname)
    data = {"cf1:name": 'zhangsan', "cf1:age": "18"}
    table.put(row='20180113', data=data)  # the current timestamp is used by default
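
To confirm the write, the row can be read back with happybase's row method; a minimal check (note that happybase returns cell keys and values as bytes under Python 3):

    # read the row back by its row key
    print(table.row('20180113'))  # e.g. {b'cf1:name': b'zhangsan', b'cf1:age': b'18'}
    hpbase.close()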

5. Batch write data

Batch writes use happybase's batch method. The code is as follows:

# num is a (row_array, row_index) pair produced by zipWithIndex
def WriteArr(num):
    hpbase = Happybase_ope()
    tbname = 'tb2'

    table = hpbase.table(tbname)
    bat = table.batch()

    # the row key combines the row index of the 2D array with the column index
    for i in range(0, len(num[0])):
        bat.put('{0}-{1}'.format(num[1], i), {'cf1:value': num[0][i]})
        # print('{0}-{1}'.format(num[1], i))
        # print(num[0][i])
    bat.send()
    hpbase.close()
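
happybase's Batch also works as a context manager and accepts a batch_size, so mutations are flushed in chunks automatically instead of accumulating until a single send(). A minimal sketch of the same loop inside WriteArr under that variant (the chunk size of 1000 is an arbitrary choice):

    # flushes every 1000 puts; any remaining puts are sent when the with-block exits
    with table.batch(batch_size=1000) as bat:
        for i in range(0, len(num[0])):
            bat.put('{0}-{1}'.format(num[1], i), {'cf1:value': num[0][i]})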

Import numpy and use it to create a 2000x5000 two-dimensional array, then convert that array into an RDD with pyspark:

if __name__ == '__main__':
    conf = SparkConf()
    conf.setAppName("hpbase")
    #conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)

    lenx = 2000
    leny = 5000
    nums = np.ones((lenx, leny), dtype=str)  # every value defaults to 1, stored as a string
    rddnums = sc.parallelize(nums)  # create the RDD
    # zipWithIndex attaches an index to every element (each element is one row of the array);
    # foreach then writes each row to HBase
    rddnums.zipWithIndex().repartition(100).foreach(lambda zipArr: WriteArr(zipArr))
    #rdd.foreach(lambda num:printNum(num))
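
Because foreach calls WriteArr once per row, every row opens its own Thrift connection. If that becomes a bottleneck, foreachPartition can be used instead so that each partition shares one connection and one batch; the sketch below is an alternative under that assumption (the function name WritePartition is hypothetical):

# rows is an iterator over the (row_array, row_index) pairs of one partition
def WritePartition(rows):
    hpbase = Happybase_ope()
    table = hpbase.table('tb2')
    # one batch per partition, flushed every 1000 puts
    with table.batch(batch_size=1000) as bat:
        for arr, idx in rows:
            for i in range(0, len(arr)):
                bat.put('{0}-{1}'.format(idx, i), {'cf1:value': arr[i]})
    hpbase.close()

# rddnums.zipWithIndex().repartition(100).foreachPartition(WritePartition)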

The full code is as follows:

#!usr/bin/python
# -*- coding: utf-8 -*-
from pyspark import SparkContext,SparkConf
import happybase
import time
import numpy as np

class Happybase_ope:
    def __init__(self,host='localhost',port=9090,timeout=None,table_prefix=None):
        self.connection=happybase.Connection(host=host,port=port,timeout=timeout,table_prefix=table_prefix)

    def createTable(self,tablename,families):
        self.connection.create_table(tablename,families)


    def table(self,tablename):
        return self.connection.table(tablename)


    def close(self):
        self.connection.close()

def WriteArr(num):
    hpbase=Happybase_ope()
    tbname='tb2'

    table=hpbase.table(tbname)
    bat = table.batch()

    for i in range(0,len(num[0])):
        bat.put('{0}-{1}'.format(num[1],i), {'cf1:value': num[0][i]})
        # print('{0}-{1}'.format(num[1],i))
        # print(num[0][i])
    bat.send()
    hpbase.close()
if __name__ == '__main__':
    conf=SparkConf()
    conf.setAppName("hpbase")
    #conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)

    lenx=2000
    leny=5000
    nums = np.ones((lenx, leny), dtype=str)
    # nums=np.arange(18).reshape(3,6)
    rddnums=sc.parallelize(nums)
    rddnums.zipWithIndex().repartition(100).foreach(lambda zipArr:WriteArr(zipArr))
    #rdd.foreach(lambda num:printNum(num))
