Java 程序:在 PDF、Word、Excel 文件的内容中进行关键字检索(只能检索可编辑的文字内容,内容中的图片等非文本格式无法检索)
文件内容的获取不依赖于 Windows 环境,程序可在任意环境下运行并进行检索
pdf工具包下载
https://download.csdn.net/download/qq_42049516/87617559
package com.rangz.util;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.rangz.util.impl.KeywordListener;
/**
 * Keyword search over the text content of PDF, Word and Excel attachments.
 *
 * <p>Each {@code searchXxx} method returns the first keyword (in array order)
 * that occurs in the extracted text, or an empty string when nothing matches
 * or the file could not be read.
 */
public class AttachSearch {

    /**
     * Searches the attachment referenced by each row and prints the hits to
     * {@code System.err}.
     *
     * <p>Rows are skipped when they are null, have no {@code FILENAME} value,
     * point to a file that does not exist, or have an unsupported extension.
     * Nothing is printed when {@code list} is null or empty.
     *
     * @param list     rows (e.g. a database result set); the {@code FILENAME}
     *                 entry is the full path of the attachment and the
     *                 {@code TITLE} entry is echoed in the output
     * @param keywords keywords to look for
     */
    public static void search(List<LinkedHashMap<String, String>> list, String [] keywords){
        if (list == null || list.isEmpty()) {
            return;
        }
        List<String> showList = new ArrayList<>(); // one formatted line per hit
        for (LinkedHashMap<String, String> row : list) {
            if (row == null) {
                continue;
            }
            String fileName = row.get("FILENAME");
            // new File(null) throws NullPointerException, so guard first.
            if (fileName == null || !new File(fileName).exists()) {
                continue;
            }
            String hit = "";
            if (hasExtension(fileName, ".pdf")) {
                hit = searchPDF(fileName, keywords);
            } else if (hasExtension(fileName, ".doc") || hasExtension(fileName, ".docx")) {
                hit = searchWord(fileName, keywords);
            } else if (hasExtension(fileName, ".xls") || hasExtension(fileName, ".xlsx")) {
                hit = searchExcel(fileName, keywords);
            }
            if (StringUtils.isNotBlank(hit)) {
                showList.add(hit + ">>>>>>>>" + fileName + ">>>>>>>>" + row.get("TITLE"));
            }
        }
        System.err.println("附件关键字检索结果:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        for (String line : showList) {
            System.err.println(line);
        }
    }

    /**
     * Searches the text of a PDF file, page by page.
     *
     * <p>The text chunks of one page are concatenated without a separator
     * before matching, so a keyword that is split across chunks is still
     * found; a keyword that spans two pages is not.
     *
     * @param fileName path of the PDF file
     * @param keywords keywords to look for
     * @return the first matching keyword, or {@code ""} if none matches or the
     *         file cannot be read
     */
    public static String searchPDF(String fileName,String [] keywords){
        if (keywords == null || keywords.length == 0) {
            return "";
        }
        PdfReader reader = null;
        try {
            reader = new PdfReader(fileName);
            // One parser serves every page of the same reader.
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                KeywordListener listener = new KeywordListener();
                parser.processContent(page, listener);
                StringBuilder pageText = new StringBuilder();
                for (String chunk : listener.getAllItems()) {
                    pageText.append(chunk);
                }
                String hit = firstMatch(pageText.toString(), keywords);
                if (!hit.isEmpty()) {
                    return hit;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the underlying file; the reader was never closed before.
            if (reader != null) {
                reader.close();
            }
        }
        return "";
    }

    /**
     * Searches the text of a Word document ({@code .doc} or {@code .docx},
     * extension matched case-insensitively).
     *
     * @param fileName path of the Word file
     * @param keywords keywords to look for
     * @return the first matching keyword, or {@code ""} if none matches, the
     *         extension is not a Word extension, or the file cannot be read
     */
    public static String searchWord(String fileName, String [] keywords){
        if (fileName == null || keywords == null || keywords.length == 0) {
            return "";
        }
        String text = "";
        try {
            if (hasExtension(fileName, ".doc")) {
                // The stream is only needed while the extractor is constructed and read.
                try (InputStream is = new FileInputStream(new File(fileName))) {
                    WordExtractor extractor = new WordExtractor(is);
                    text = extractor.getText();
                }
            } else if (hasExtension(fileName, ".docx")) {
                OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
                try {
                    text = new XWPFWordExtractor(opcPackage).getText();
                } finally {
                    // revert() closes the package without writing anything back to the file.
                    opcPackage.revert();
                }
            }
        } catch (IOException | XmlException | OpenXML4JException e) {
            e.printStackTrace();
        }
        return firstMatch(text, keywords);
    }

    /**
     * Searches the text of an Excel workbook, one sheet at a time.
     *
     * <p>All cell texts of a sheet are concatenated without a separator before
     * matching.
     *
     * @param fileName path of the Excel file
     * @param keywords keywords to look for
     * @return the first matching keyword, or {@code ""} if none matches or the
     *         file cannot be read
     */
    public static String searchExcel(String fileName, String [] keywords){
        if (keywords == null || keywords.length == 0) {
            return "";
        }
        try {
            // NOTE(review): readExcelContentMoreSheet starts reading at sheet row index n,
            // so passing 1 leaves the first row of every sheet unsearched - confirm that
            // skipping the header row is intended.
            List<ArrayList<HashMap<Integer, String>>> sheets = ExcelUtil.readExcelContentMoreSheet(fileName, 1);
            if (sheets == null) {
                return "";
            }
            for (ArrayList<HashMap<Integer, String>> sheet : sheets) {
                StringBuilder sheetText = new StringBuilder();
                for (HashMap<Integer, String> rowCells : sheet) {
                    // Cells are keyed by column index 0..size-1; walk them in column order.
                    for (int col = 0; col < rowCells.size(); col++) {
                        String cellText = rowCells.get(col);
                        if (cellText != null) { // avoid appending the literal "null"
                            sheetText.append(cellText);
                        }
                    }
                }
                String hit = firstMatch(sheetText.toString(), keywords);
                if (!hit.isEmpty()) {
                    return hit;
                }
            }
        } catch (Exception e) {
            // Best effort: an unreadable workbook counts as "no match".
            e.printStackTrace();
        }
        return "";
    }

    /** Returns the first non-empty keyword contained in {@code text}, or {@code ""}. */
    private static String firstMatch(String text, String[] keywords) {
        if (text == null || text.isEmpty() || keywords == null) {
            return "";
        }
        for (String keyword : keywords) {
            if (keyword != null && !keyword.isEmpty() && text.contains(keyword)) {
                return keyword;
            }
        }
        return "";
    }

    /** Tells whether {@code fileName} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String fileName, String extension) {
        int offset = fileName.length() - extension.length();
        return offset >= 0 && fileName.regionMatches(true, offset, extension, 0, extension.length());
    }

    /** Small manual test driver with hard-coded sample paths. */
    public static void main(String[] args) {
        List<LinkedHashMap<String, String>> list = new ArrayList<>();
        LinkedHashMap<String, String> map = new LinkedHashMap<>();
        map.put("FILENAME", "/data/file/2023/2302210854567.pdf");
        list.add(map);
        map = new LinkedHashMap<>();
        map.put("FILENAME", "/data/file/2023/2302210854567.pdf");
        list.add(map);
        map = new LinkedHashMap<>();
        map.put("FILENAME", "/data/file/2023/2302210854567.xls");
        list.add(map);
        // The list normally comes from a database query. A list of maps is used so
        // that other columns can be kept alongside the search result; if they are
        // not needed a plain array of file names would do.
        String [] keywords = {"张珊珊", "李诗诗", "王舞舞", "赵柳柳"};
        // Run the search.
        search(list, keywords);
    }
}
PDF内容获取实现类
package com.rangz.util.impl;
import java.util.ArrayList;
import java.util.List;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
public class KeywordListener implements RenderListener {
private List<String> allItems = new ArrayList<String>();
public void beginTextBlock() {
//do nothing
}
public void renderText(TextRenderInfo renderInfo) {
allItems.add(renderInfo.getText());//保存所有的项
}
public void endTextBlock() {
//do nothing
}
public void renderImage(ImageRenderInfo renderInfo) {
//do nothing
}
public List<String> getAllItems() {
return allItems;
}
public void setAllItems(List<String> allItems) {
this.allItems = allItems;
}
}
Excel内容获取工具类
package com.rangz.util;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.NumberToTextConverter;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
/**
 * Helpers for reading the cell contents of Excel workbooks ({@code .xlsx} and
 * {@code .xls}) into plain maps of column index to cell text.
 *
 * <p>The class keeps no state between calls: every read opens its own
 * workbook and releases it before returning.
 *
 * @author rangz
 */
public class ExcelUtil {

    /**
     * Reads the first sheet of a workbook.
     *
     * <p>The number of columns read per row is the number of defined cells in
     * the sheet's first row. Reading stops at the first missing row, which is
     * recorded as an empty map.
     *
     * @param filePath path of the workbook
     * @param n        1-based number of the first row to read
     * @return one map (column index to cell text) per row read; empty when the
     *         workbook cannot be opened or the sheet has no first row
     */
    public static ArrayList<LinkedHashMap<Integer, String>> readExcelContent( String filePath, int n) {
        ArrayList<LinkedHashMap<Integer, String>> content = new ArrayList<LinkedHashMap<Integer, String>>();
        Workbook workbook = openWorkbook(filePath);
        if (workbook == null) {
            return content;
        }
        try {
            Sheet sheet = workbook.getSheetAt(0);
            Row header = sheet.getRow(0);
            if (header == null) {
                return content; // empty sheet: nothing to size the columns from
            }
            int colNum = header.getPhysicalNumberOfCells();
            int lastRow = sheet.getLastRowNum();
            for (int i = n - 1; i <= lastRow; i++) {
                Row row = sheet.getRow(i);
                LinkedHashMap<Integer, String> cells = new LinkedHashMap<Integer, String>();
                if (row == null) {
                    // A missing row ends the data; keep the historical empty trailing entry.
                    content.add(cells);
                    break;
                }
                for (int j = 0; j < colNum; j++) {
                    cells.put(j, getStringCellValue(row.getCell(j)));
                }
                content.add(cells);
            }
        } finally {
            closeQuietly(workbook);
        }
        return content;
    }

    /**
     * Converts a cell to its text form.
     *
     * <ul>
     *   <li>null, blank and unsupported cells give {@code ""};</li>
     *   <li>date-formatted numeric cells give {@code yyyy-MM-dd};</li>
     *   <li>other numeric cells give the plain number text produced by
     *       {@code NumberToTextConverter};</li>
     *   <li>formula cells give their cached text or numeric result, or
     *       {@code ""} when the cached result is neither.</li>
     * </ul>
     *
     * @param cell the cell, may be null
     * @return the trimmed cell text, never null
     */
    private static String getStringCellValue(Cell cell) {
        if (cell == null) {
            return "";
        }
        String text;
        switch (cell.getCellType()) {
        case Cell.CELL_TYPE_FORMULA:
            text = formulaResultText(cell);
            break;
        case Cell.CELL_TYPE_STRING:
            text = cell.getStringCellValue();
            break;
        case Cell.CELL_TYPE_NUMERIC:
            // Dates are stored as numbers; the cell format tells them apart.
            if (DateUtil.isCellDateFormatted(cell)) {
                text = new SimpleDateFormat("yyyy-MM-dd").format(cell.getDateCellValue());
            } else {
                text = NumberToTextConverter.toText(cell.getNumericCellValue());
            }
            break;
        case Cell.CELL_TYPE_BOOLEAN:
            text = String.valueOf(cell.getBooleanCellValue());
            break;
        default:
            text = "";
            break;
        }
        // Check for null before dereferencing, then strip surrounding whitespace.
        return text == null ? "" : text.trim();
    }

    /** Returns a formula cell's cached result as text, or {@code ""} if it is neither text nor a number. */
    private static String formulaResultText(Cell cell) {
        try {
            return String.valueOf(cell.getStringCellValue());
        } catch (IllegalStateException notText) {
            // cached result is not text; try the numeric form next
        }
        try {
            return NumberToTextConverter.toText(cell.getNumericCellValue());
        } catch (IllegalStateException notNumeric) {
            // e.g. an error result: treat as empty instead of aborting the whole read
            return "";
        }
    }

    /**
     * Removes trailing zeros after a decimal point, and the point itself if
     * nothing is left behind it (e.g. {@code "1.50"} becomes {@code "1.5"} and
     * {@code "3.0"} becomes {@code "3"}). Strings without a {@code '.'} after
     * the first character are returned unchanged.
     *
     * @param sNum the number text
     * @return the number text without superfluous trailing zeros
     */
    public String getNoMoreZero(String sNum) {
        if (sNum.indexOf(".") <= 0) {
            return sNum;
        }
        String withoutZeros = sNum.replaceAll("0+?$", "");
        return withoutZeros.replaceAll("[.]$", "");
    }

    /**
     * Reads every sheet of a workbook; each sheet becomes one list of rows.
     *
     * <p>For each sheet the number of columns read per row is the number of
     * defined cells in that sheet's first row. A sheet without a first row
     * yields an empty list; a missing row inside the data yields a row of
     * empty strings.
     *
     * @param filePath absolute path of the workbook
     * @param n        row index at which reading starts; it is used directly as
     *                 the sheet row index, so 0 is the first row (unlike
     *                 {@link #readExcelContent}, where n is 1-based)
     * @return one list per sheet, each holding one map (column index to cell
     *         text) per row read; empty when the workbook cannot be opened
     */
    public static ArrayList<ArrayList<HashMap<Integer, String>>> readExcelContentMoreSheet(String filePath, int n) {
        ArrayList<ArrayList<HashMap<Integer, String>>> rList = new ArrayList<ArrayList<HashMap<Integer, String>>>();
        Workbook workbook = openWorkbook(filePath);
        if (workbook == null) {
            return rList;
        }
        try {
            for (int k = 0; k < workbook.getNumberOfSheets(); k++) {
                rList.add(readSheet(workbook.getSheetAt(k), n));
            }
        } finally {
            closeQuietly(workbook);
        }
        return rList;
    }

    /** Reads the rows of one sheet from row index {@code firstRow} to the last row. */
    private static ArrayList<HashMap<Integer, String>> readSheet(Sheet sheet, int firstRow) {
        ArrayList<HashMap<Integer, String>> content = new ArrayList<HashMap<Integer, String>>();
        Row header = sheet.getRow(0);
        if (header == null) {
            return content; // empty sheet
        }
        int colNum = header.getPhysicalNumberOfCells();
        int lastRow = sheet.getLastRowNum();
        for (int i = firstRow; i <= lastRow; i++) {
            Row row = sheet.getRow(i); // null for a row that was never written
            HashMap<Integer, String> cells = new HashMap<Integer, String>();
            for (int j = 0; j < colNum; j++) {
                cells.put(j, row == null ? "" : getStringCellValue(row.getCell(j)));
            }
            content.add(cells);
        }
        return content;
    }

    /**
     * Opens the file as {@code .xlsx} first and falls back to {@code .xls}.
     *
     * @return the workbook, or null when the fallback fails with an I/O error
     */
    private static Workbook openWorkbook(String filePath) {
        try {
            return new XSSFWorkbook(filePath); // Excel 2007+ format
        } catch (Exception ignored) {
            // Not an OOXML workbook; fall through to the Excel 97-2003 format.
        }
        // The stream is only needed while the HSSF workbook is being constructed.
        try (FileInputStream in = new FileInputStream(filePath)) {
            return new HSSFWorkbook(in);
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Releases the file held by a path-opened {@code .xlsx} workbook without writing to it. */
    private static void closeQuietly(Workbook workbook) {
        if (workbook instanceof XSSFWorkbook) {
            try {
                // revert() closes the package and discards changes instead of saving them.
                ((XSSFWorkbook) workbook).getPackage().revert();
            } catch (RuntimeException e) {
                e.printStackTrace();
            }
        }
    }
}