Bootstrap

POI-SAX解析EXCEL文件,动态解析,解析指定sheet

最近做了一个 Excel 解析的功能,使用 POI 工具进行解析。网上很多解决方案都是像下面这样基于 XSSFWorkbook(用户模型)进行解析的。我试了一下:单个文件、小文件的确没有什么问题,解析也比较直观方便;但是上了测试环境后发现,几百个 Excel(平均每个 3 MB 左右)没解析几个就把资源占满,服务宕机了。后来查看 JVM 发现内存被吃满了:XSSFWorkbook 在解析时为了返回 DOM 树结构,会为每个单元格创建对象,解析过程中产生大量对象;即便我只解析第一个 sheet,它也会对整个工作簿做全量预处理,算是用空间换时间。大概实现如下:

/**
 * Reads an Excel file with the POI usermodel API and returns the parsed content.
 *
 * <p>The suffix of {@code filePath} (from the last {@code '.'} onwards) is handed to
 * {@code PoiExcelUtil.getWorkbook} as the file type. Every failure while opening or
 * parsing is logged and reported to the caller as {@code null}.
 *
 * <p>A failure while closing the workbook is only logged: it no longer replaces an
 * already computed result with {@code null} (the original returned from inside
 * {@code finally}, which silently discarded the parsed data).
 *
 * @param filePath path of the Excel file
 * @return the result of {@code analysisExcel}, or {@code null} if the workbook could not be
 *         obtained or an exception was thrown while reading
 */
public Map<String, Object> readExcel(String filePath) {
        Workbook workbook = null;
        try {
            // A path without '.' makes substring(-1) throw; that is handled by the catch below.
            String fileType = filePath.substring(filePath.lastIndexOf("."));
            workbook = PoiExcelUtil.getWorkbook(filePath, fileType);
            if (workbook == null) {
                logger.info("获取workbook对象失败");
                return null;
            }
            // Parse the workbook.
            return analysisExcel(workbook);
        } catch (Exception e) {
            logger.error("读取Excel文件失败" + filePath + "错误信息", e);
            return null;
        } finally {
            try {
                if (null != workbook) {
                    workbook.close();
                }
            } catch (Exception e) {
                // Deliberately no return here: a return in finally would override the
                // value (or exception) produced by the try/catch above.
                logger.error("关闭数据流出错!错误信息:", e);
            }
        }
    }
/**
 * Opens the file at {@code filePath} as a POI usermodel workbook.
 *
 * <p>{@code fileType} is compared case-insensitively against the {@code XLS} and
 * {@code XLSX} constants to pick {@code HSSFWorkbook} or {@code XSSFWorkbook}.
 *
 * <p>A failure while closing the input stream is only logged. The original returned
 * {@code null} from inside {@code finally} in that case, which threw away a workbook
 * that had already been loaded and left it unclosed.
 *
 * @param filePath path of the Excel file
 * @param fileType file-type suffix used to select the workbook implementation
 * @return the workbook, or {@code null} if the file does not exist, the type matches
 *         neither constant, or loading failed
 */
public static Workbook getWorkbook(String filePath, String fileType) {
        Workbook workbook = null;
        FileInputStream fileInputStream = null;
        try {
            File excelFile = new File(filePath);
            if (!excelFile.exists()) {
                logger.info(filePath + "文件不存在");
                return null;
            }
            fileInputStream = new FileInputStream(excelFile);
            if (fileType.equalsIgnoreCase(XLS)) {
                workbook = new HSSFWorkbook(fileInputStream);
            } else if (fileType.equalsIgnoreCase(XLSX)) {
                workbook = new XSSFWorkbook(fileInputStream);
            }
        } catch (Exception e) {
            logger.error("获取文件失败", e);
        } finally {
            try {
                if (null != fileInputStream) {
                    fileInputStream.close();
                }
            } catch (Exception e) {
                // Log only; returning from finally would discard the loaded workbook.
                logger.error("关闭数据流出错!错误信息:", e);
            }
        }
        return workbook;
    }
// This method is pseudo-code: it only sketches how the header row and the data rows are read.
// NOTE(review): `sheet`, `firstRowCount`, `dataList`, `mapKeys`, `useCellNums` and
// `sheetCondition` are not declared in this snippet, and the returned `map` is never
// populated here.
public Map<String, Object> analysisExcel(Workbook workbook) {
        Map<String, Object> map = new HashMap<>();
        logger.debug("-------------{} start......------------------",                     
        workbook.getSheetAt(0).getSheetName());
        // firstRowCount is the row number of the first (header) row
        Row firstRow = sheet.getRow(firstRowCount);
        int cellCount = firstRow.getLastCellNum();// number of columns, taken from the header row
        for (int i1 = 0; i1 < cellCount; i1++) {
            System.out.println(firstRow.getCell(i1).toString());// header cell
        }
        // rowStart / rowEnd are derived from the caller's own logic
        int rowStart = firstRowCount+1;
        // NOTE(review): getPhysicalNumberOfRows() looks like a count of defined rows rather
        // than the last row index — confirm this upper bound is what is intended.
        int rowEnd = sheet.getPhysicalNumberOfRows();
        for (int j = rowStart; j < rowEnd; j++) {
            Row row = sheet.getRow(j);// fetch the row object for this index
            if (row == null){
                // stops at the first missing row instead of skipping it
                break;
            }
            // convert each row into a Map
            Map<String, Object> rowMap = PoiExcelUtil.convertRowToData(row, cellCount, mapKeys, useCellNums, sheetCondition.getKeyType());
            dataList.add(rowMap);
        }
        return map;
    }

以上写法效率不高,做了内存调优、换用其他 POI 接口之后性能也没有明显改善。后来调研发现,原因在于它会为整个工作簿生成 DOM 树:即使我只想解析前一两个 sheet,它也会把所有数据都生成对象并加载到内存。所以我改用了基于 SAX 的方式。

采用 SAX(事件模型)方式解析时,POI 会直接流式读取 xlsx 内部的 XML,一边读一边触发回调,而不是把整个工作簿加载成对象,从而避免了内存溢出。我这边测试 300 多个几兆的文件,只解析前两个 sheet,只需要一分多钟,也不会造成内存溢出。

具体做法如下:

<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>5.2.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.2.3</version>
        </dependency>

调用,测试入口 

import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.json.JSONUtil; 
import  ExcelReadDataDelegated;
import  ExcelXlsxReaderWithDefaultHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.ClassPathResource;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Demo entry point for the SAX (event-model) based XLSX reader.
 *
 * <p>Fixes over the original sample:
 * <ul>
 *   <li>{@link #startReadExcel(InputStream)} never filled its result list, so it always
 *       returned {@code null}; rows are now collected.</li>
 *   <li>The reference to the undeclared {@code codeTables} variable was removed.</li>
 *   <li>A parse failure is logged at error level.</li>
 *   <li>{@link #main(String[])} closes the resource stream.</li>
 * </ul>
 */
public class ReadExcel {

    private static final Logger logger = LoggerFactory.getLogger(ReadExcel.class);

    /** Key under which {@link #startReadExcel(InputStream)} stores the collected rows. */
    private static final String ROWS_KEY = "rows";

    /**
     * Streams the workbook in {@code inputStream} through a new
     * {@code ExcelXlsxReaderWithDefaultHandler}, delivering every row to the callback.
     *
     * @param inputStream            XLSX content; not closed by this method
     * @param excelReadDataDelegated callback invoked once per row
     * @throws Exception if the workbook cannot be opened or parsed
     */
    public static void readExcel(InputStream inputStream, ExcelReadDataDelegated excelReadDataDelegated) throws Exception {
        ExcelXlsxReaderWithDefaultHandler excelXlsxReader = new ExcelXlsxReaderWithDefaultHandler(excelReadDataDelegated);
        excelXlsxReader.process(inputStream);
    }

    /**
     * Reads the workbook and collects its data rows.
     *
     * <p>Row 1 of every sheet is treated as the header row (the handler restarts its row
     * counter at 1 for each sheet). Every later row becomes one map from header text to
     * cell text. Rows of all parsed sheets are appended to the same list in parsing order.
     *
     * @param inputStream XLSX content; not closed by this method
     * @return a map holding the row list under the key {@code "rows"}, or {@code null} if
     *         parsing failed or no data row was found
     */
    public static Map<String, Object> startReadExcel(InputStream inputStream) {
        List<Map<String, String>> listRes = new ArrayList<>();
        List<String> headers = new ArrayList<>();
        try {
            readExcel(inputStream, (sheetIndex, totalRowCount, curRow, cellList) -> {
                // sheetIndex identifies the sheet this row belongs to
                logger.info(JSONUtil.toJsonStr(cellList));
                if (curRow == 1) {
                    // a new sheet starts: remember its header row
                    headers.clear();
                    headers.addAll(cellList);
                    return;
                }
                listRes.add(toRowMap(headers, cellList));
            });
        } catch (Exception e) {
            logger.error("解析失败:{}", inputStream, e);
            return null;
        }
        if (CollectionUtil.isEmpty(listRes)) {
            logger.info("解析失败,无模型数据:{}", inputStream);
            return null;
        }
        Map<String, Object> mapRes = new HashMap<>();
        mapRes.put(ROWS_KEY, listRes);
        return mapRes;
    }

    /**
     * Builds one row map keyed by header text.
     *
     * <p>A cell whose header is missing, empty or already used is keyed by its 1-based
     * column position instead. Empty cells beyond the header width are skipped.
     */
    private static Map<String, String> toRowMap(List<String> headers, List<String> cells) {
        Map<String, String> row = new HashMap<>();
        for (int i = 0; i < cells.size(); i++) {
            String cell = cells.get(i);
            boolean hasHeader = i < headers.size();
            if (!hasHeader && cell.isEmpty()) {
                continue;
            }
            String key = hasHeader ? headers.get(i) : "";
            if (key.isEmpty() || row.containsKey(key)) {
                key = String.valueOf(i + 1);
            }
            row.put(key, cell);
        }
        return row;
    }

    /**
     * Sample run. The file may come from the local disk or from the classpath; here it is
     * {@code data/info.xlsx} under the project's resource directory.
     */
    public static void main(String[] args) throws Exception {
        ClassPathResource resource = new ClassPathResource("/data/info.xlsx");
        try (InputStream inputStream = resource.getInputStream()) {
            startReadExcel(inputStream);
        }
    }
}
/**
 * Callback to which the Excel reader delegates every row it has read.
 */
public interface ExcelReadDataDelegated {

    /**
     * Receives one row as soon as it has been read.
     *
     * <p>Handing rows over one at a time, instead of caching them, keeps memory usage low.
     * The original author wrote this with Flume reading large Excel files in mind.
     *
     * @param sheetIndex    position of the sheet
     * @param totalRowCount total number of rows in that sheet
     * @param curRow        row number
     * @param cellList      cell values of the row
     */
    void readExcelDate(int sheetIndex, int totalRowCount, int curRow, List<String> cellList);

}

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * @author quyl
 * @date 2023/03/05
 * @description 读取EXCEL辅助类
 */
public class ExcelXlsxReaderWithDefaultHandler extends DefaultHandler {

    private ExcelReadDataDelegated excelReadDataDelegated;

    /**
     * @return the callback that currently receives the parsed rows
     */
    public ExcelReadDataDelegated getExcelReadDataDelegated() {
        return this.excelReadDataDelegated;
    }

    /**
     * Replaces the callback that receives the parsed rows.
     *
     * @param excelReadDataDelegated the new row callback
     */
    public void setExcelReadDataDelegated(final ExcelReadDataDelegated excelReadDataDelegated) {
        this.excelReadDataDelegated = excelReadDataDelegated;
    }

    /**
     * Creates a reader that hands every parsed row to the given callback.
     *
     * @param excelReadDataDelegated the row callback
     */
    public ExcelXlsxReaderWithDefaultHandler(final ExcelReadDataDelegated excelReadDataDelegated) {
        this.excelReadDataDelegated = excelReadDataDelegated;
    }

    /**
     * The data types a cell value may have.
     */
    enum CellDataType {
        /** Boolean cell (cell type attribute {@code "b"}). */
        BOOL,
        /** Error cell (cell type attribute {@code "e"}). */
        ERROR,
        /** Formula result string (cell type attribute {@code "str"}). */
        FORMULA,
        /** Inline string (cell type attribute {@code "inlineStr"}). */
        INLINESTR,
        /** Index into the shared-strings table (cell type attribute {@code "s"}). */
        SSTINDEX,
        /** Number; chosen when none of the other cell types applies. */
        NUMBER,
        /** Date; chosen when the style's format string matches a known date pattern. */
        DATE,
        /** Chosen when the cell style has no format string. */
        NULL
    }

    /**
     * Limit on the number of sheets to read: doSheet() keeps iterating only while
     * sheetIndex is below this value.
     */
    public final static Integer SHEET_NUM = 3;

    /**
     * Shared-strings table; assigned in fetchSheetParser() and used to resolve
     * shared-string indexes into text.
     */
    private SharedStringsTable sst;

    /**
     * Text collected by characters() for the current element; cleared at every
     * startElement(). Holds either a shared-string index or the literal cell content.
     */
    private String lastIndex;

    /**
     * Path of the file, as passed to process(String).
     */
    private String filePath = "";

    /**
     * Sheet counter; incremented in doSheet() before each sheet is parsed.
     */
    private int sheetIndex = 0;

    /**
     * Name of the sheet currently being parsed.
     */
    private String sheetName = "";

    /**
     * Total row count returned by process().
     * NOTE(review): nothing in this class assigns it after initialisation — confirm
     * whether it is meant to be updated while parsing.
     */
    private int totalRows = 0;

    // Set in rowlistLast() to the cell list of row 1.
    private List<String> titlelist = new ArrayList<String>();
    /**
     * Cells of the row currently being read.
     */
    private List<String> cellList = new ArrayList<String>();

    /**
     * Current row number; doSheet() resets it to 1 for every sheet.
     */
    private int curRow = 1;

    /**
     * Column index of the current cell, as computed by getRowIndex() from the cell reference.
     */
    private int curCol = 0;


    /**
     * Data type of the next cell value; defaults to shared string.
     */
    private CellDataType nextDataType = CellDataType.SSTINDEX;

    private final DataFormatter formatter = new DataFormatter();

    /**
     * Data-format index taken from the cell style (used for number/date formatting).
     */
    private short formatIndex;

    /**
     * Format string taken from the cell style.
     */
    private String formatString;

    // Intended to hold the widest cell reference of a row, for padding trailing missing cells.
    // NOTE(review): not referenced anywhere else in this class.
    private String maxRef = null;

    /**
     * Styles table of the workbook; assigned in doSheet().
     */
    private StylesTable stylesTable;

    // True when the current cell's value is an index into the shared-strings table.
    private boolean nextIsString;
    private int preCol = 0; // column index of the previous cell that had a value
    // Title row number. NOTE(review): the original remark said "normally 0", but the value
    // is 1 and it is compared against curRow, which starts at 1.
    private int titleRow = 1;
    // Width that rows after the title row are padded to; set in rowlistLast().
    private int rowsize = 0;

    /**
     * Row count derived from the sheet's dimension reference in startElement().
     */
    private Integer totalRowCount;


    /**
     * Opens the workbook stored at {@code filename} and streams its sheets through this
     * handler (see {@link #doSheet(OPCPackage)}).
     *
     * <p>The package is always reverted, even when parsing fails, so the file handle is
     * not leaked. {@code revert()} is used instead of {@code close()} because closing
     * would save the file and thereby change its MD5.
     *
     * @param filename path of the XLSX file
     * @return the value of {@code totalRows}
     * @throws Exception if the package cannot be opened or a sheet cannot be parsed
     */
    public int process(String filename) throws Exception {
        filePath = filename;
        OPCPackage pkg = OPCPackage.open(filename);
        try {
            doSheet(pkg);
        } finally {
            // close without saving
            pkg.revert();
        }
        return totalRows;
    }

    /**
     * Opens the workbook contained in {@code inputStream} and streams its sheets through
     * this handler (see {@link #doSheet(OPCPackage)}).
     *
     * <p>The package is always reverted (closed without saving), even when parsing fails.
     * The stream itself is not closed here; that stays the caller's responsibility.
     *
     * @param inputStream XLSX content
     * @return the value of {@code totalRows}
     * @throws Exception if the package cannot be opened or a sheet cannot be parsed
     */
    public int process(InputStream inputStream) throws Exception {
        OPCPackage pkg = OPCPackage.open(inputStream);
        try {
            doSheet(pkg);
        } finally {
            // close without saving
            pkg.revert();
        }
        return totalRows;
    }

    /**
     * Parses the sheets of {@code pkg} one after another with this object as the SAX
     * content handler, stopping after {@code SHEET_NUM} sheets.
     *
     * <p>Each sheet stream is closed via try-with-resources and the shared-strings table
     * is closed in a {@code finally} block, so neither is leaked when parsing fails.
     *
     * @param pkg the opened OOXML package
     * @throws Exception if the package parts cannot be read or a sheet cannot be parsed
     */
    public void doSheet(OPCPackage pkg) throws Exception {
        XSSFReader xssfReader = new XSSFReader(pkg);
        stylesTable = xssfReader.getStylesTable();
        SharedStringsTable sst = (SharedStringsTable) xssfReader.getSharedStringsTable();
        try {
            XMLReader parser = fetchSheetParser(sst);
            XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
            while (sheets.hasNext() && sheetIndex < SHEET_NUM) {
                curRow = 1; // every sheet starts again at row 1
                sheetIndex++;
                // next() must be called before getSheetName(); the name belongs to the
                // sheet most recently returned by the iterator.
                try (InputStream sheet = sheets.next()) {
                    sheetName = sheets.getSheetName();
                    // Drives startElement(), characters() and endElement() for the sheet.
                    parser.parse(new InputSource(sheet));
                }
            }
        } finally {
            // Guarded in case the workbook has no shared-strings part.
            if (sst != null) {
                sst.close();
            }
        }
    }

    /**
     * Creates the SAX reader used to parse the sheet XML and registers this object as its
     * content handler.
     *
     * <p>Workbooks may come from untrusted sources, so the reader is hardened against XML
     * external entity (XXE) attacks: DOCTYPE declarations are rejected and external
     * entities are disabled. Worksheet XML needs neither.
     *
     * @param sst shared-strings table, stored for resolving shared-string cells
     * @return the configured reader
     * @throws SAXException if the reader cannot be created or rejects a security feature
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser = XMLReaderFactory.createXMLReader();
        parser.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        parser.setFeature("http://xml.org/sax/features/external-general-entities", false);
        parser.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    /**
     * Invoked first for every element: records sheet- and cell-level metadata and clears
     * the text buffer.
     *
     * <ul>
     *   <li>{@code dimension}: its {@code ref} (e.g. {@code A1:B5}) yields the row count;
     *       the trailing row number minus one is stored in {@code totalRowCount}.</li>
     *   <li>{@code c} (cell): the column is derived from the {@code r} reference and the
     *       {@code t} type decides whether the value is a shared-string index.</li>
     * </ul>
     *
     * <p>The {@code r} attribute of a cell is optional in SpreadsheetML. When it is absent
     * the cell is taken to follow the previous one, instead of failing with a
     * {@code NullPointerException}. A {@code dimension} without {@code ref} is ignored.
     *
     * @param uri        namespace URI, or the empty string
     * @param localName  local name without prefix, or the empty string
     * @param name       qualified name; this is what the handler matches on
     * @param attributes attributes attached to the element
     * @throws SAXException declared by the SAX contract
     */
    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        if ("dimension".equals(name)) {
            String dimensionStr = attributes.getValue("ref");
            if (dimensionStr != null) {
                totalRowCount = totalCount(dimensionStr) - 1;
            }
        } else if ("c".equals(name)) {
            String cellRef = attributes.getValue("r");
            // curCol is reset to 0 at the end of each row, so +1 is the next column.
            curCol = cellRef != null ? this.getRowIndex(cellRef) : curCol + 1;
            // t="s" means the value is an index into the shared-strings table.
            nextIsString = "s".equals(attributes.getValue("t"));
        }
        // start collecting the text of this element from scratch
        lastIndex = "";
    }


    /**
     * Invoked second: appends a chunk of character data to the text buffer.
     *
     * <p>The parser may deliver the text of one element in several chunks, so the chunks
     * are accumulated in {@code lastIndex} until the element ends.
     *
     * @param ch     buffer holding the characters
     * @param start  offset of the first character of this chunk
     * @param length number of characters in this chunk
     * @throws SAXException declared by the SAX contract
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        final String chunk = new String(ch, start, length);
        lastIndex = lastIndex + chunk;
    }


    /**
     * Invoked third: stores the value of a finished {@code v} element in the current row,
     * or delegates every other end tag to {@link #rowlistLast(String)}.
     *
     * <p>For a {@code v} element of a shared-string cell the collected text is an index
     * into the shared-strings table and is replaced by the referenced string. Columns
     * skipped since the previous value are filled with empty strings so that list
     * positions match column positions.
     *
     * <p>Changes from the original: the shared-string lookup runs only for {@code v}
     * elements (it used to run, and usually fail silently, on every end tag), and a failed
     * lookup is handled explicitly rather than by an empty {@code catch}.
     *
     * @param uri       namespace URI, or the empty string
     * @param localName local name without prefix, or the empty string
     * @param name      qualified name; this is what the handler matches on
     * @throws SAXException declared by the SAX contract
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if (!"v".equals(name)) {
            rowlistLast(name);
            return;
        }
        if (nextIsString) {
            lastIndex = resolveSharedString(lastIndex);
        }
        // strip surrounding whitespace before storing the value
        String value = lastIndex.trim();
        // fill the columns skipped between the previous value and this one
        for (int gap = curCol - preCol - 1; gap > 0; gap--) {
            cellList.add(preCol, "");
        }
        preCol = curCol;
        cellList.add(curCol - 1, value);
    }

    /**
     * Resolves a shared-string index to its text; returns the raw text unchanged when it
     * cannot be resolved.
     */
    private String resolveSharedString(String rawIndex) {
        if (sst == null) {
            return rawIndex;
        }
        try {
            int idx = Integer.parseInt(rawIndex.trim());
            return new XSSFRichTextString(sst.getItemAt(idx).getString()).toString();
        } catch (RuntimeException ignored) {
            // Best effort, as in the original: a non-numeric or out-of-range index keeps
            // the raw cell text instead of aborting the whole parse.
            return rawIndex;
        }
    }

    /**
     * Finishes the current row when {@code name} is {@code row}: pads it, hands it to the
     * callback and resets the per-row state. Any other element name is ignored.
     *
     * <p>The sheet's {@code dimension} element may be absent, in which case
     * {@code totalRowCount} is still {@code null}. The callback then receives {@code 0}
     * as the total instead of the unboxing failing with a {@code NullPointerException}.
     *
     * @param name qualified name of the element that just ended
     */
    private void rowlistLast(String name) {
        if (!"row".equals(name)) {
            return;
        }
        // Rows after the title row are padded with empty cells up to rowsize.
        if (curRow > this.titleRow) {
            while (cellList.size() < this.rowsize) {
                cellList.add("");
            }
        }
        if (curRow == 1) {
            titlelist = cellList;
        }
        int knownTotal = totalRowCount == null ? 0 : totalRowCount;
        excelReadDataDelegated.readExcelDate(sheetIndex, knownTotal, curRow, cellList);
        if (curRow == this.titleRow) {
            // NOTE(review): the extra 7 columns are kept from the original; the reason
            // for this constant is not visible here — confirm it is still wanted.
            this.rowsize = cellList.size() + 7;
        }
        // A fresh list per row, so the callback may keep the one it was given.
        cellList = new ArrayList<>();
        curRow++;
        curCol = 0;
        preCol = 0;
    }


    /**
     * Determines the data type and number format of the cell described by
     * {@code attributes} and stores them in {@code nextDataType}, {@code formatIndex} and
     * {@code formatString}.
     *
     * <p>The cell type attribute {@code t} selects the type; a missing or unknown value
     * means number. If the cell has a style ({@code s}), its data format is read as well:
     * a format string containing one of the known date patterns turns the cell into a
     * date, and a style without a format string marks the cell as {@code NULL} and falls
     * back to the built-in format for the index.
     *
     * <p>The original dereferenced the format string before checking it for {@code null};
     * the null case is now tested first. A style index without a matching style is ignored.
     *
     * <p>NOTE(review): the date and NULL overrides apply to non-numeric cells too, as in
     * the original — confirm whether they should be limited to numeric cells.
     *
     * @param attributes attributes of the {@code c} element
     */
    public void setNextDataType(Attributes attributes) {
        nextDataType = CellDataType.NUMBER; // no cell type means number
        formatIndex = -1;
        formatString = null;
        String cellType = attributes.getValue("t"); // cell type
        String cellStyleStr = attributes.getValue("s"); // cell style index

        if ("b".equals(cellType)) { // boolean
            nextDataType = CellDataType.BOOL;
        } else if ("e".equals(cellType)) { // error
            nextDataType = CellDataType.ERROR;
        } else if ("inlineStr".equals(cellType)) {
            nextDataType = CellDataType.INLINESTR;
        } else if ("s".equals(cellType)) { // shared string
            nextDataType = CellDataType.SSTINDEX;
        } else if ("str".equals(cellType)) {
            nextDataType = CellDataType.FORMULA;
        }

        if (cellStyleStr == null) {
            return;
        }
        int styleIndex = Integer.parseInt(cellStyleStr);
        XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
        if (style == null) {
            // no style stored under this index; keep the type derived from "t"
            return;
        }
        formatIndex = style.getDataFormat();
        formatString = style.getDataFormatString();
        if (formatString == null) {
            nextDataType = CellDataType.NULL;
            formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
        } else if (formatString.contains("m/d/yy") || formatString.contains("yyyy/mm/dd") || formatString.contains("yyyy/m/d")) {
            nextDataType = CellDataType.DATE;
            formatString = "yyyy-MM-dd hh:mm:ss";
        }
    }

    /**
     * Converts the raw text of a cell into its display string according to
     * {@code nextDataType}.
     *
     * <ul>
     *   <li>BOOL: {@code "FALSE"} if the text starts with {@code '0'}, else {@code "TRUE"}</li>
     *   <li>ERROR / FORMULA: the text wrapped in double quotes (ERROR prefixed with
     *       {@code ERROR:})</li>
     *   <li>INLINESTR: the text itself</li>
     *   <li>SSTINDEX: the shared string stored under the index; the raw text if it is not
     *       a number</li>
     *   <li>NUMBER: formatted with the cell's format string if there is one; underscores
     *       and surrounding whitespace removed</li>
     *   <li>DATE: formatted with the cell's format string, {@code 'T'} replaced by a space</li>
     *   <li>anything else: a single space</li>
     * </ul>
     *
     * <p>Unlike the original, inputs that cannot be converted no longer throw: a
     * {@code null} value or an empty BOOL text returns {@code thisStr} unchanged, and
     * NUMBER/DATE text that is not a number is used as it is.
     *
     * @param value   raw text of the cell
     * @param thisStr fallback result, normally an empty string
     * @return the display string
     */
    @SuppressWarnings("deprecation")
    public String getDataValue(String value, String thisStr) {
        if (value == null) {
            return thisStr;
        }
        switch (nextDataType) {
            case BOOL:
                if (!value.isEmpty()) {
                    thisStr = value.charAt(0) == '0' ? "FALSE" : "TRUE";
                }
                break;
            case ERROR:
                thisStr = "\"ERROR:" + value + '"';
                break;
            case FORMULA:
                thisStr = '"' + value + '"';
                break;
            case INLINESTR:
                thisStr = new XSSFRichTextString(value).toString();
                break;
            case SSTINDEX:
                try {
                    int idx = Integer.parseInt(value);
                    // look the text up by its shared-string index
                    thisStr = new XSSFRichTextString(sst.getItemAt(idx).getString()).toString();
                } catch (NumberFormatException ex) {
                    thisStr = value;
                }
                break;
            case NUMBER:
                thisStr = formatString != null ? formatNumeric(value) : value;
                thisStr = thisStr.replace("_", "").trim();
                break;
            case DATE:
                // drop the 'T' separator from the date string
                thisStr = formatNumeric(value).replace("T", " ");
                break;
            default:
                thisStr = " ";
                break;
        }
        return thisStr;
    }

    /**
     * Formats numeric cell text with the current format; returns the text unchanged when
     * it is not a number.
     */
    private String formatNumeric(String value) {
        try {
            return formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);
        } catch (NumberFormatException ex) {
            return value;
        }
    }


    /**
     * Counts the empty columns between two cell references of the same row.
     *
     * <p>The digits are stripped from both references and the remaining column letters
     * are left-padded with {@code '@'} (the character just before {@code 'A'}) to three
     * characters. Three letters cover every column of Excel 2007, whose last column is
     * XFD (16384 columns, 1048576 rows). The padded letters are then compared position by
     * position as base-26 digits.
     *
     * @param ref    reference of the current cell, e.g. {@code D3}
     * @param preRef reference of the previous cell, e.g. {@code A3}
     * @return column distance between the two references minus one
     */
    public int countNullCell(String ref, String preRef) {
        char[] current = fillChar(ref.replaceAll("\\d+", ""), 3, '@', true).toCharArray();
        char[] previous = fillChar(preRef.replaceAll("\\d+", ""), 3, '@', true).toCharArray();
        int distance = 0;
        for (int pos = 0; pos < 3; pos++) {
            distance = distance * 26 + (current[pos] - previous[pos]);
        }
        return distance - 1;
    }

    /**
     * Pads {@code str} with {@code let} until it is {@code len} characters long.
     *
     * @param str   text to pad
     * @param len   target length; a string that is already this long is returned unchanged
     * @param let   padding character
     * @param isPre {@code true} to pad at the front, {@code false} to pad at the end
     * @return the padded string
     */
    public String fillChar(String str, int len, char let, boolean isPre) {
        if (str.length() >= len) {
            return str;
        }
        int missing = len - str.length();
        StringBuilder padding = new StringBuilder(missing);
        for (int k = 0; k < missing; k++) {
            padding.append(let);
        }
        return isPre ? padding + str : str + padding;
    }

    /**
     * Extracts the trailing number of a range reference such as {@code A1:B5}.
     *
     * <p>The text is scanned backwards until an upper-case letter {@code A-Z} is met;
     * everything after that letter is parsed as an integer.
     *
     * @param var reference text, e.g. {@code A1:B5}
     * @return the trailing number, or {@code 0} if the text ends with an upper-case letter
     *         or is empty
     * @throws NumberFormatException if the trailing part is not a valid integer
     */
    public Integer totalCount(String var) {
        int cut = var.length();
        while (cut > 0) {
            char c = var.charAt(cut - 1);
            if (c >= 'A' && c <= 'Z') {
                break;
            }
            cut--;
        }
        String tail = var.substring(cut);
        if (tail.isEmpty()) {
            return 0;
        }
        return Integer.parseInt(tail);
    }


    /**
     * Returns the 1-based column index encoded in a cell reference.
     *
     * <p>The {@code r} attribute of a {@code c} element consists of column letters
     * followed by the row number. The letters are read as a base-26 number with
     * {@code A = 1 .. Z = 26}: {@code AB45} is column {@code 1 * 26 + 2 = 28} of row 45.
     * Every character outside {@code A-Z} is ignored. Despite its name, this method
     * returns a column index, not a row index.
     *
     * <p>The index is computed with integer arithmetic; the original went through
     * {@code float}, {@code Math.pow} and the platform default charset.
     *
     * @param rowStr cell reference, e.g. {@code AB45}
     * @return the 1-based column index, or {@code 0} if there are no column letters
     */
    public int getRowIndex(String rowStr) {
        int column = 0;
        for (int i = 0; i < rowStr.length(); i++) {
            char c = rowStr.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                column = column * 26 + (c - 'A' + 1);
            }
        }
        return column;
    }

}

调用,测试入口 处替换自己的文件就可以测试了,本文给出了工具类和思路,具体定制化根据返回结果自由处理,记录下。

;