最近在做一个Excel解析功能,使用POI工具进行解析。网上很多解决方案都是如下这种基于XSSFWorkbook的写法,我先用单个小文件试了一下,确实没有什么问题,解析也比较直观方便。但是上了测试环境后发现,几百个Excel(平均每个3兆左右)没解析几个,资源就被占满,服务宕机了。后来查看JVM监控发现内存被吃满了:XSSFWorkbook在解析时,为了返回给我们DOM树结构,会为每个单元格创建对象,解析过程中会产生大量的对象;即便我只解析第一个sheet,它也会对整个文件做全量预处理,算是用空间换时间。大概实现如下:
/**
 * Reads an Excel file through the DOM-style POI workbook API and returns the parsed content.
 *
 * <p>The text from the last '.' of {@code filePath} is passed to
 * {@code PoiExcelUtil.getWorkbook} as the file type. The workbook is always closed before
 * returning; a failure while closing is logged but does not discard a result that was
 * already parsed successfully.
 *
 * @param filePath path of the Excel file to read
 * @return the map produced by {@code analysisExcel}, or {@code null} when the path has no
 *         extension, the workbook cannot be obtained, or parsing throws
 */
public Map<String, Object> readExcel(String filePath) {
    Workbook workbook = null;
    try {
        int dot = filePath.lastIndexOf('.');
        if (dot < 0) {
            // Without this guard substring(-1) would throw and hide the real cause.
            logger.error("读取Excel文件失败" + filePath + "错误信息: 文件名缺少扩展名");
            return null;
        }
        String fileType = filePath.substring(dot);
        workbook = PoiExcelUtil.getWorkbook(filePath, fileType);
        if (workbook == null) {
            logger.info("获取workbook对象失败");
            return null;
        }
        // Parse the workbook.
        return analysisExcel(workbook);
    } catch (Exception e) {
        logger.error("读取Excel文件失败" + filePath + "错误信息", e);
        return null;
    } finally {
        if (workbook != null) {
            try {
                workbook.close();
            } catch (Exception e) {
                // Log only: returning from finally would override the parsed result.
                logger.error("关闭数据流出错!错误信息:", e);
            }
        }
    }
}
/**
 * Opens the file at {@code filePath} and builds the workbook implementation matching
 * {@code fileType}.
 *
 * <p>The input stream is closed before returning. If the workbook was built but closing the
 * stream fails, the failure is logged and the workbook is still returned.
 *
 * @param filePath path of the Excel file
 * @param fileType file type compared case-insensitively against the {@code XLS} and
 *                 {@code XLSX} constants; may be {@code null}
 * @return the workbook, or {@code null} when the file does not exist, the type is not
 *         supported, or the file cannot be read
 */
public static Workbook getWorkbook(String filePath, String fileType) {
    Workbook workbook = null;
    try {
        File excelFile = new File(filePath);
        if (!excelFile.exists()) {
            logger.info(filePath + "文件不存在");
            return null;
        }
        try (FileInputStream fileInputStream = new FileInputStream(excelFile)) {
            // Constant-first comparison so a null fileType is simply "unsupported".
            if (XLS.equalsIgnoreCase(fileType)) {
                workbook = new HSSFWorkbook(fileInputStream);
            } else if (XLSX.equalsIgnoreCase(fileType)) {
                workbook = new XSSFWorkbook(fileInputStream);
            } else {
                logger.info(filePath + "不支持的文件类型" + fileType);
            }
        }
    } catch (Exception e) {
        if (workbook == null) {
            logger.error("获取文件失败", e);
        } else {
            // The workbook is complete; only closing the stream failed.
            logger.error("关闭数据流出错!错误信息:", e);
        }
    }
    return workbook;
}
/**
 * Pseudo-code sketch of reading the header row and the data rows of a sheet.
 *
 * <p>{@code sheet}, {@code firstRowCount}, {@code dataList}, {@code mapKeys},
 * {@code useCellNums} and {@code sheetCondition} are not defined in this snippet and must be
 * supplied by the surrounding class.
 *
 * @param workbook the opened workbook; only its first sheet's name is read here, for logging
 * @return the result map; this sketch returns it without adding entries
 */
public Map<String, Object> analysisExcel(Workbook workbook) {
    Map<String, Object> map = new HashMap<>();
    logger.debug("-------------{} start......------------------",
            workbook.getSheetAt(0).getSheetName());
    // firstRowCount is the index of the header row.
    Row firstRow = sheet.getRow(firstRowCount);
    if (firstRow == null) {
        // No header row: nothing to parse.
        return map;
    }
    int cellCount = firstRow.getLastCellNum(); // number of columns
    for (int i1 = 0; i1 < cellCount; i1++) {
        // Header cell; a cell that was never written comes back as null.
        Object headerCell = firstRow.getCell(i1);
        logger.debug("{}", headerCell == null ? "" : headerCell.toString());
    }
    // rowStart / rowEnd are derived from the business logic.
    int rowStart = firstRowCount + 1;
    // getLastRowNum() is the index of the last row; getPhysicalNumberOfRows() only counts
    // defined rows and is too small as an end index when the sheet has undefined rows.
    int rowEnd = sheet.getLastRowNum() + 1;
    for (int j = rowStart; j < rowEnd; j++) {
        Row row = sheet.getRow(j);
        if (row == null) {
            // Undefined row in the middle of the data: skip it instead of dropping the rest.
            continue;
        }
        // Convert each row into a Map.
        Map<String, Object> rowMap = PoiExcelUtil.convertRowToData(row, cellCount, mapKeys, useCellNums, sheetCondition.getKeyType());
        dataList.add(rowMap);
    }
    return map;
}
以上写法效率不太高,而且在做了内存调优、尝试切换POI的其他接口之后,性能依然不理想。后来调研发现,原因在于它会为整个文件生成DOM树:即使我只想解析前一两个sheet,它也会把所有数据都生成对象并加载到内存中,所以我改用了基于SAX的方式。
解析Excel时,采用SAX方式会以事件流的形式逐个读取xlsx内部的XML,而不是一次性把整个文件加载成对象,从而避免了内存溢出。我这边测试300多个几兆大小的文件,只解析前两个sheet,只需要一分多钟,也不会造成内存溢出。
具体做法如下:
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
调用,测试入口
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.json.JSONUtil;
import ExcelReadDataDelegated;
import ExcelXlsxReaderWithDefaultHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.core.io.ClassPathResource;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Demo entry point for the SAX-based xlsx reader: feeds a stream to
 * {@code ExcelXlsxReaderWithDefaultHandler} and collects the rows it reports.
 */
public class ReadExcel {
    private static final Logger logger = LoggerFactory.getLogger(ReadExcel.class);

    /**
     * Runs the streaming reader over an xlsx stream.
     *
     * @param inputStream            xlsx content; not closed by this method
     * @param excelReadDataDelegated callback invoked once per row
     * @throws Exception if the workbook cannot be opened or parsed
     */
    public static void readExcel(InputStream inputStream, ExcelReadDataDelegated excelReadDataDelegated) throws Exception {
        ExcelXlsxReaderWithDefaultHandler excelXlsxReader = new ExcelXlsxReaderWithDefaultHandler(excelReadDataDelegated);
        excelXlsxReader.process(inputStream);
    }

    /**
     * Reads the workbook and collects every row after the first one of each sheet.
     *
     * <p>The first row reported for a sheet ({@code curRow == 1}) is used as the header; each
     * later row becomes a map from header text to cell text (a blank header is replaced by
     * {@code "col" + position}). Rows are grouped in the result under {@code "sheet" + sheetIndex}.
     *
     * @param inputStream xlsx content; not closed by this method
     * @return map from {@code "sheetN"} to that sheet's list of row maps, or {@code null} when
     *         parsing fails or no data row was found
     */
    public static Map<String, Object> startReadExcel(InputStream inputStream) {
        Map<String, Object> mapRes = new HashMap<>();
        List<Map<String, String>> listRes = new ArrayList<>();
        Map<String, List<Map<String, String>>> rowsBySheet = new HashMap<>();
        List<String> headers = new ArrayList<>();
        try {
            readExcel(inputStream, (sheetIndex, totalRowCount, curRow, cellList) -> {
                // sheetIndex identifies the sheet this row belongs to.
                logger.info(JSONUtil.toJsonStr(cellList));
                if (curRow == 1) {
                    headers.clear();
                    headers.addAll(cellList);
                    return;
                }
                Map<String, String> row = new HashMap<>();
                for (int i = 0; i < headers.size(); i++) {
                    String header = headers.get(i);
                    String key = header.isEmpty() ? "col" + (i + 1) : header;
                    row.put(key, i < cellList.size() ? cellList.get(i) : "");
                }
                listRes.add(row);
                rowsBySheet.computeIfAbsent("sheet" + sheetIndex, k -> new ArrayList<>()).add(row);
            });
        } catch (Exception e) {
            logger.error("解析失败:{}", inputStream, e);
            return null;
        }
        if (CollectionUtil.isEmpty(listRes)) {
            logger.info("解析失败,无模型数据:{}", inputStream);
            return null;
        }
        // Hook for code-table pre-processing: the original referenced an undeclared
        // `codeTables` variable here; add that step once the data source is defined.
        mapRes.putAll(rowsBySheet);
        return mapRes;
    }

    /**
     * The file may come from the local disk or from the project's resource directory; this
     * demo reads info.xlsx from resource/data.
     */
    public static void main(String[] args) throws Exception {
        ClassPathResource resource = new ClassPathResource("/data/info.xlsx");
        try (InputStream inputStream = resource.getInputStream()) {
            startReadExcel(inputStream);
        }
    }
}
/**
 * Callback that receives the data of an Excel sheet one row at a time.
 *
 * <p>It has a single abstract method so that it can be implemented with a lambda.
 */
@FunctionalInterface
public interface ExcelReadDataDelegated {
    /**
     * Called once for every row that has been read, so the row can be written out right away.
     * Handling each record as it arrives (for example when Flume reads a large Excel file)
     * avoids caching all rows and greatly reduces memory consumption.
     *
     * @param sheetIndex    position of the sheet
     * @param totalRowCount total number of rows of that sheet
     * @param curRow        row number
     * @param cellList      cell values of the row
     */
    void readExcelDate(int sheetIndex, int totalRowCount, int curRow, List<String> cellList);
}
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.util.XMLHelper;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
/**
 * Streaming (SAX / event-model) reader for .xlsx workbooks.
 *
 * <p>Each worksheet part is parsed as XML with this object as the content handler. The cell
 * values of one row are collected as strings and handed to the
 * {@link ExcelReadDataDelegated} callback when the row ends, so no cell object tree is built.
 * At most {@link #SHEET_NUM} sheets are read, in workbook order.
 *
 * <p>Only cells that carry a {@code <v>} element contribute a value. Shared-string cells
 * ({@code t="s"}) are resolved through the shared strings table; every other value is passed
 * on as its raw text. An instance keeps per-parse state in its fields and must not be used by
 * several threads at once.
 *
 * @author quyl
 * @date 2023/03/05
 */
public class ExcelXlsxReaderWithDefaultHandler extends DefaultHandler {

    /**
     * Data types a cell value may have; used by {@link #setNextDataType} and
     * {@link #getDataValue}.
     */
    enum CellDataType {
        BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, DATE, NULL
    }

    /** Maximum number of sheets read from one workbook (the first {@code SHEET_NUM} sheets). */
    public final static Integer SHEET_NUM = 3;

    /**
     * Data rows shorter than (title row width + this value) are padded with empty strings.
     * NOTE(review): the reason for the 7 extra columns is not visible in this file — confirm.
     */
    private static final int EXTRA_PAD_COLUMNS = 7;

    /** Receiver of the parsed rows. */
    private ExcelReadDataDelegated excelReadDataDelegated;
    /** Shared strings table of the workbook; may be null when the workbook has none. */
    private SharedStringsTable sst;
    /** Character data collected since the most recent element start. */
    private final StringBuilder textBuffer = new StringBuilder();
    /** Path given to {@link #process(String)}. */
    private String filePath = "";
    /** 1-based index of the sheet being parsed (0 before the first sheet). */
    private int sheetIndex = 0;
    /** Name of the sheet being parsed. */
    private String sheetName = "";
    /** Number of non-empty rows after the title row, summed over all sheets of one parse. */
    private int totalRows = 0;
    /** Cells of the title row of the current sheet. */
    private List<String> titlelist = new ArrayList<String>();
    /** Cells of the row currently being read. */
    private List<String> cellList = new ArrayList<String>();
    /** 1-based ordinal of the current {@code <row>} element within the sheet. */
    private int curRow = 1;
    /** 1-based column of the current cell. */
    private int curCol = 0;
    /** 1-based column of the previous cell that had a value (0 at row start). */
    private int preCol = 0;
    /** Ordinal of the title row. */
    private int titleRow = 1;
    /** Width data rows are padded to; set when the title row ends. */
    private int rowsize = 0;
    /** Last row number of the sheet's {@code <dimension>} ref minus one; 0 when absent. */
    private int totalRowCount = 0;
    /** Whether the current cell's value is an index into the shared strings table. */
    private boolean nextIsString;
    /** Cell data type selected by {@link #setNextDataType}; string index by default. */
    private CellDataType nextDataType = CellDataType.SSTINDEX;
    private final DataFormatter formatter = new DataFormatter();
    /** Data format index of the current cell's style. */
    private short formatIndex;
    /** Data format string of the current cell's style. */
    private String formatString;
    /** Styles of the workbook. */
    private StylesTable stylesTable;

    public ExcelXlsxReaderWithDefaultHandler(ExcelReadDataDelegated excelReadDataDelegated) {
        this.excelReadDataDelegated = excelReadDataDelegated;
    }

    public ExcelReadDataDelegated getExcelReadDataDelegated() {
        return excelReadDataDelegated;
    }

    public void setExcelReadDataDelegated(ExcelReadDataDelegated excelReadDataDelegated) {
        this.excelReadDataDelegated = excelReadDataDelegated;
    }

    /**
     * Parses the workbook stored at {@code filename}.
     *
     * @param filename path of the .xlsx file
     * @return number of non-empty rows found after the title row of each sheet that was read
     * @throws Exception if the file cannot be opened or parsed
     */
    public int process(String filename) throws Exception {
        filePath = filename;
        return processPackage(OPCPackage.open(filename));
    }

    /**
     * Parses the workbook contained in {@code inputStream}. The stream is not closed here.
     *
     * @param inputStream .xlsx content
     * @return number of non-empty rows found after the title row of each sheet that was read
     * @throws Exception if the content cannot be opened or parsed
     */
    public int process(InputStream inputStream) throws Exception {
        return processPackage(OPCPackage.open(inputStream));
    }

    /** Parses the package and always reverts it afterwards, even when parsing fails. */
    private int processPackage(OPCPackage pkg) throws Exception {
        totalRows = 0;
        try {
            doSheet(pkg);
        } finally {
            // revert() closes without saving; close() would rewrite the file and change its MD5.
            pkg.revert();
        }
        return totalRows;
    }

    /**
     * Parses up to {@link #SHEET_NUM} sheets of the package, reporting every row to the
     * callback.
     *
     * @param pkg the opened package; not closed by this method
     * @throws Exception if a workbook part cannot be read or parsed
     */
    public void doSheet(OPCPackage pkg) throws Exception {
        XSSFReader xssfReader = new XSSFReader(pkg);
        stylesTable = xssfReader.getStylesTable();
        SharedStringsTable sharedStrings = (SharedStringsTable) xssfReader.getSharedStringsTable();
        sheetIndex = 0; // allow the same instance to be used for another workbook
        try {
            XMLReader parser = fetchSheetParser(sharedStrings);
            XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
            while (sheets.hasNext() && sheetIndex < SHEET_NUM) {
                resetSheetState();
                sheetIndex++;
                // getSheetName() is only valid after next() has been called.
                try (InputStream sheet = sheets.next()) {
                    sheetName = sheets.getSheetName();
                    // startElement(), characters() and endElement() fire during parse().
                    parser.parse(new InputSource(sheet));
                }
            }
        } finally {
            // Guarded in case the reader returned no shared strings table.
            if (sharedStrings != null) {
                sharedStrings.close();
            }
        }
    }

    /** Clears the per-sheet state so values of a previous sheet cannot leak into the next. */
    private void resetSheetState() {
        curRow = 1;
        curCol = 0;
        preCol = 0;
        rowsize = 0;
        totalRowCount = 0;
        cellList = new ArrayList<>();
        textBuffer.setLength(0);
    }

    /**
     * Creates the XML reader used for the sheet parts and registers this object as its
     * content handler.
     *
     * @param sst shared strings table used to resolve string cells; may be {@code null}
     * @return the configured reader
     * @throws SAXException if no reader can be created
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        XMLReader parser;
        try {
            // POI's helper returns a reader configured with POI's secure-processing defaults.
            parser = XMLHelper.newXMLReader();
        } catch (ParserConfigurationException e) {
            throw new SAXException("Unable to create an XML reader for the sheet data", e);
        }
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    /**
     * Called first for every element: records the sheet dimension and the position and type
     * of a cell, and clears the text buffer.
     *
     * @param uri        namespace URI, or an empty string
     * @param localName  local name without prefix, or an empty string
     * @param name       qualified name
     * @param attributes attributes of the element
     * @throws SAXException never thrown by this implementation
     */
    @Override
    public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
        if ("dimension".equals(name)) {
            // ref looks like "A1:B5"; the trailing number is the last row.
            String dimensionRef = attributes.getValue("ref");
            totalRowCount = dimensionRef == null ? 0 : totalCount(dimensionRef) - 1;
        } else if ("c".equals(name)) {
            // The r attribute may be missing; fall back to the column after the previous value.
            String cellRef = attributes.getValue("r");
            curCol = cellRef == null ? preCol + 1 : getRowIndex(cellRef);
            // t="s": the value is an index into the shared strings table.
            nextIsString = "s".equals(attributes.getValue("t"));
        }
        textBuffer.setLength(0);
    }

    /**
     * Called second: collects character data. It may be invoked several times for one element.
     *
     * @param ch     character array
     * @param start  start offset
     * @param length number of characters
     * @throws SAXException never thrown by this implementation
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        textBuffer.append(ch, start, length);
    }

    /**
     * Called third: stores the value at the end of {@code <v>} and flushes the row at the end
     * of {@code <row>}.
     *
     * @param uri       namespace URI, or an empty string
     * @param localName local name without prefix, or an empty string
     * @param name      qualified name
     * @throws SAXException never thrown by this implementation
     */
    @Override
    public void endElement(String uri, String localName, String name) throws SAXException {
        if ("v".equals(name)) {
            String raw = textBuffer.toString();
            String value = (nextIsString ? resolveSharedString(raw) : raw).trim();
            int index = curCol - 1;
            // Fill the columns skipped since the previous value with empty strings.
            while (cellList.size() < index) {
                cellList.add("");
            }
            if (index >= 0) {
                cellList.add(index, value);
            } else {
                cellList.add(value);
            }
            preCol = curCol;
        } else if ("row".equals(name)) {
            finishRow();
        }
    }

    /**
     * Resolves a shared strings index to its text; returns {@code raw} unchanged when there is
     * no table or {@code raw} is not a valid index.
     */
    private String resolveSharedString(String raw) {
        if (sst == null) {
            return raw;
        }
        try {
            return sst.getItemAt(Integer.parseInt(raw.trim())).getString();
        } catch (NumberFormatException | IndexOutOfBoundsException ignored) {
            // Best effort: keep the raw text rather than failing the whole sheet.
            return raw;
        }
    }

    /** Pads the finished row, reports it to the callback and resets the row state. */
    private void finishRow() {
        boolean hasValues = !cellList.isEmpty();
        if (curRow > titleRow) {
            while (cellList.size() < rowsize) {
                cellList.add("");
            }
            if (hasValues) {
                totalRows++;
            }
        }
        if (curRow == titleRow) {
            titlelist = cellList;
        }
        excelReadDataDelegated.readExcelDate(sheetIndex, totalRowCount, curRow, cellList);
        if (curRow == titleRow) {
            rowsize = cellList.size() + EXTRA_PAD_COLUMNS;
        }
        cellList = new ArrayList<>();
        curRow++;
        curCol = 0;
        preCol = 0;
    }

    /**
     * Derives the data type and number format of a cell from its attributes and stores them
     * for {@link #getDataValue}.
     *
     * @param attributes attributes of a {@code <c>} element ({@code t} = type, {@code s} = style index)
     */
    public void setNextDataType(Attributes attributes) {
        nextDataType = CellDataType.NUMBER; // no t attribute means a numeric cell
        formatIndex = -1;
        formatString = null;
        String cellType = attributes.getValue("t");
        String cellStyleStr = attributes.getValue("s");
        if ("b".equals(cellType)) {
            nextDataType = CellDataType.BOOL;
        } else if ("e".equals(cellType)) {
            nextDataType = CellDataType.ERROR;
        } else if ("inlineStr".equals(cellType)) {
            nextDataType = CellDataType.INLINESTR;
        } else if ("s".equals(cellType)) {
            nextDataType = CellDataType.SSTINDEX;
        } else if ("str".equals(cellType)) {
            nextDataType = CellDataType.FORMULA;
        }
        if (cellStyleStr == null) {
            return;
        }
        XSSFCellStyle style = stylesTable.getStyleAt(Integer.parseInt(cellStyleStr));
        if (style == null) {
            return;
        }
        formatIndex = style.getDataFormat();
        formatString = style.getDataFormatString();
        if (formatString == null) {
            // Checked first: the date test below would otherwise dereference null.
            nextDataType = CellDataType.NULL;
            formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
        } else if (nextDataType == CellDataType.NUMBER
                && (formatString.contains("m/d/yy") || formatString.contains("yyyy/mm/dd") || formatString.contains("yyyy/m/d"))) {
            // Only a numeric cell can hold a date serial; a styled string cell stays a string.
            nextDataType = CellDataType.DATE;
            formatString = "yyyy-MM-dd hh:mm:ss";
        }
    }

    /**
     * Converts a raw cell value according to the type stored by {@link #setNextDataType}.
     *
     * @param value   raw text of the cell: 0/1 for BOOL, the content for ERROR, FORMULA, NUMBER
     *                and DATE, a shared strings index for SSTINDEX
     * @param thisStr ignored on input; kept for compatibility
     * @return the converted text; a single space for the NULL type
     */
    public String getDataValue(String value, String thisStr) {
        switch (nextDataType) {
            case BOOL:
                thisStr = value.charAt(0) == '0' ? "FALSE" : "TRUE";
                break;
            case ERROR:
                thisStr = "\"ERROR:" + value + '"';
                break;
            case FORMULA:
                thisStr = '"' + value + '"';
                break;
            case INLINESTR:
                thisStr = new XSSFRichTextString(value).toString();
                break;
            case SSTINDEX:
                thisStr = resolveSharedString(value);
                break;
            case NUMBER:
                thisStr = value;
                if (formatString != null) {
                    try {
                        thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString).trim();
                    } catch (NumberFormatException ignored) {
                        // Not a number after all: keep the raw text.
                    }
                }
                thisStr = thisStr.replace("_", "").trim();
                break;
            case DATE:
                try {
                    thisStr = formatter.formatRawCellContents(Double.parseDouble(value), formatIndex, formatString);
                } catch (NumberFormatException ignored) {
                    // Not a date serial: keep the raw text.
                    thisStr = value;
                }
                // Remove the 'T' separator from the formatted date.
                thisStr = thisStr.replace("T", " ");
                break;
            default:
                thisStr = " ";
                break;
        }
        return thisStr;
    }

    /**
     * Counts the columns lying strictly between two cell references of the same row.
     *
     * @param ref    reference of the later cell, e.g. {@code "D3"}
     * @param preRef reference of the earlier cell, e.g. {@code "A3"}
     * @return number of columns between them, e.g. 2 for A3 and D3
     */
    public int countNullCell(String ref, String preRef) {
        return getRowIndex(ref) - getRowIndex(preRef) - 1;
    }

    /**
     * Pads {@code str} with {@code let} until it is {@code len} characters long.
     *
     * @param str   text to pad
     * @param len   target length; a longer {@code str} is returned unchanged
     * @param let   padding character
     * @param isPre {@code true} to pad on the left, {@code false} to pad on the right
     * @return the padded text
     */
    public String fillChar(String str, int len, char let, boolean isPre) {
        int missing = len - str.length();
        if (missing <= 0) {
            return str;
        }
        StringBuilder padding = new StringBuilder(missing);
        for (int i = 0; i < missing; i++) {
            padding.append(let);
        }
        return isPre ? padding + str : str + padding;
    }

    /**
     * Extracts the trailing row number of a dimension reference such as {@code "A1:B5"}.
     *
     * @param var dimension reference
     * @return the number formed by the trailing digits (5 for {@code "A1:B5"}), or 0 when the
     *         reference does not end with a digit
     */
    public Integer totalCount(String var) {
        int end = var.length();
        int start = end;
        while (start > 0 && var.charAt(start - 1) >= '0' && var.charAt(start - 1) <= '9') {
            start--;
        }
        if (start == end) {
            return 0;
        }
        return Integer.parseInt(var.substring(start, end));
    }

    /**
     * Returns the 1-based column index encoded in a cell reference. Despite its name it does
     * not return a row index: the letters of the reference are the column, the digits the row.
     * For example {@code "AB45"} is column (A-A+1)*26 + (B-A+1) = 28, row 45.
     *
     * @param rowStr cell reference such as {@code "AB45"}; may be {@code null}
     * @return the column index, or 0 when the reference is null or has no upper-case letters
     */
    public int getRowIndex(String rowStr) {
        if (rowStr == null) {
            return 0;
        }
        int column = 0;
        for (int i = 0; i < rowStr.length(); i++) {
            char c = rowStr.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                column = column * 26 + (c - 'A' + 1);
            }
        }
        return column;
    }
}
把"调用,测试入口"处的文件替换成自己的文件就可以测试了。本文给出了工具类和思路,具体的定制化处理可以根据回调返回的结果自由实现,在此记录一下。