Bootstrap

java PDF/Word/Excel文件内容关键字检索

Java 程序:在 PDF、Word、Excel 文件的内容中进行关键字检索(只能检索可编辑的文字内容,内容里的图片等格式无法检索)。
文件内容的获取不依赖于 Windows 环境,可在任意环境下运行程序进行检索。

pdf工具包下载
https://download.csdn.net/download/qq_42049516/87617559

package com.rangz.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.rangz.util.impl.KeywordListener;

/**
 * Keyword search over the text content of PDF, Word and Excel attachments.
 *
 * <p>Only extractable text is searched; images embedded in the documents are ignored.
 * Every search method returns the first keyword that was found, or an empty string
 * when nothing matched or the file could not be read.
 */
public class AttachSearch {

    /** Separator placed between keyword, file path and title in each printed result line. */
    private static final String RESULT_SEPARATOR = ">>>>>>>>";

    /**
     * Searches every attachment referenced by the given rows and prints the hits to
     * {@code System.err}.
     *
     * <p>Each row must carry the full file path under the key {@code "FILENAME"}; the value
     * stored under {@code "TITLE"} is echoed in the result line. Rows without a file name,
     * rows whose file does not exist and files with an unsupported extension are skipped.
     *
     * @param list rows describing the attachments (typically a database result set)
     * @param keywords keywords to look for; blank entries are ignored
     */
    public static void search(List<LinkedHashMap<String, String>> list, String [] keywords){
        if (list == null || list.isEmpty()) {
            return;
        }
        List<String> hits = new ArrayList<>();
        for (LinkedHashMap<String, String> row : list) {
            if (row == null) {
                continue;
            }
            String fileName = row.get("FILENAME");
            // A missing path would make new File(...) throw, so filter it out first.
            if (StringUtils.isBlank(fileName) || !new File(fileName).exists()) {
                continue;
            }
            String matched;
            try {
                matched = searchFile(fileName, keywords);
            } catch (RuntimeException e) {
                // The parsers throw unchecked exceptions for corrupt or mislabeled files;
                // one bad attachment must not abort the rest of the batch.
                e.printStackTrace();
                continue;
            }
            if (StringUtils.isNotBlank(matched)) {
                hits.add(matched + RESULT_SEPARATOR + fileName + RESULT_SEPARATOR + row.get("TITLE"));
            }
        }
        System.err.println("附件关键字检索结果:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>");
        for (String hit : hits) {
            System.err.println(hit);
        }
    }

    /**
     * Dispatches to the format-specific search based on the file extension
     * (compared case-insensitively).
     *
     * @return the first keyword found, or an empty string for no match / unsupported type
     */
    private static String searchFile(String fileName, String[] keywords) {
        if (hasExtension(fileName, ".pdf")) {
            return searchPDF(fileName, keywords);
        }
        if (hasExtension(fileName, ".doc") || hasExtension(fileName, ".docx")) {
            return searchWord(fileName, keywords);
        }
        if (hasExtension(fileName, ".xls") || hasExtension(fileName, ".xlsx")) {
            return searchExcel(fileName, keywords);
        }
        return "";
    }

    /** Returns whether {@code fileName} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String fileName, String extension) {
        int start = fileName.length() - extension.length();
        return start >= 0 && fileName.regionMatches(true, start, extension, 0, extension.length());
    }

    /**
     * Returns the first keyword contained in {@code text}, or an empty string.
     * Null and blank keywords are skipped so that they can neither throw nor be
     * reported as a (meaningless) match that hides the remaining keywords.
     */
    private static String firstMatch(String text, String[] keywords) {
        if (text == null || keywords == null) {
            return "";
        }
        for (String keyword : keywords) {
            if (StringUtils.isBlank(keyword)) {
                continue;
            }
            if (text.contains(keyword)) {
                return keyword;
            }
        }
        return "";
    }

    /**
     * Searches the text of a PDF file page by page.
     *
     * @param fileName full path of the PDF file
     * @param keywords keywords to look for
     * @return the first keyword found on the earliest matching page, or an empty string
     *         when nothing matched or the file could not be read
     */
    public static String searchPDF(String fileName,String [] keywords){
        PdfReader reader = null;
        try {
            reader = new PdfReader(fileName);
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                KeywordListener listener = new KeywordListener();
                parser.processContent(page, listener);
                // Join all text chunks of the page so keywords split across chunks still match.
                StringBuilder pageText = new StringBuilder();
                for (String chunk : listener.getAllItems()) {
                    pageText.append(chunk);
                }
                String matched = firstMatch(pageText.toString(), keywords);
                if (!matched.isEmpty()) {
                    return matched;
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the file handle on every path, including the early return above.
            if (reader != null) {
                reader.close();
            }
        }
        return "";
    }

    /**
     * Searches the text of a Word document ({@code .doc} or {@code .docx},
     * extension compared case-insensitively).
     *
     * @param fileName full path of the Word file
     * @param keywords keywords to look for
     * @return the first keyword found, or an empty string when nothing matched,
     *         the extension is not a Word extension, or the file could not be read
     */
    public static String searchWord(String fileName, String [] keywords){
        String text = "";
        try {
            if (hasExtension(fileName, ".doc")) {
                text = readDoc(fileName);
            } else if (hasExtension(fileName, ".docx")) {
                text = readDocx(fileName);
            }
        } catch (IOException | XmlException | OpenXML4JException e) {
            e.printStackTrace();
        }
        return firstMatch(text, keywords);
    }

    /** Extracts the text of a binary {@code .doc} file, closing the input stream afterwards. */
    private static String readDoc(String fileName) throws IOException {
        try (InputStream in = new FileInputStream(new File(fileName))) {
            WordExtractor extractor = new WordExtractor(in);
            return extractor.getText();
        }
    }

    /** Extracts the text of a {@code .docx} file and releases the underlying package. */
    private static String readDocx(String fileName)
            throws IOException, XmlException, OpenXML4JException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // revert() closes the package without saving; close() would write the
            // package back to disk because it was opened from a path.
            opcPackage.revert();
        }
    }

    /**
     * Searches the text of an Excel workbook, one sheet at a time.
     *
     * @param fileName full path of the Excel file
     * @param keywords keywords to look for
     * @return the first keyword found in the earliest matching sheet, or an empty string
     *         when nothing matched or the file could not be read
     */
    public static String searchExcel(String fileName, String [] keywords){
        try {
            List<ArrayList<HashMap<Integer, String>>> sheets = ExcelUtil.readExcelContentMoreSheet(fileName, 1);
            if (sheets == null) {
                return "";
            }
            for (ArrayList<HashMap<Integer, String>> sheet : sheets) {
                // Concatenate every cell of the sheet and compare once per sheet.
                StringBuilder sheetText = new StringBuilder();
                for (HashMap<Integer, String> cells : sheet) {
                    for (int idx = 0; idx < cells.size(); idx++) {
                        sheetText.append(cells.get(idx));
                    }
                }
                String matched = firstMatch(sheetText.toString(), keywords);
                if (!matched.isEmpty()) {
                    return matched;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Small manual demo: builds a few rows by hand and runs the search.
     */
    public static void main(String[] args) {

        List<LinkedHashMap<String, String>> list = new ArrayList<>();
        LinkedHashMap<String, String> map = new LinkedHashMap<>();
        map.put("FILENAME", "/data/file/2023/2302210854567.pdf");
        list.add(map);
        map = new LinkedHashMap<>();
        map.put("FILENAME", "/data/file/2023/2302210854567.pdf");
        list.add(map);
        map = new LinkedHashMap<>();
        map.put("FILENAME", "/data/file/2023/2302210854567.xls");
        list.add(map);

        // In real use the list comes from a database query. A list of maps is used
        // because the search result needs to keep the other columns of each row;
        // if nothing else is needed, a plain array of file names would do.

        String [] keywords = {"张珊珊", "李诗诗", "王舞舞", "赵柳柳"};

        // Run the demo search.
        search(list, keywords);

    }
}


PDF内容获取实现类

package com.rangz.util.impl;
import java.util.ArrayList; 
import java.util.List;

import com.itextpdf.text.pdf.parser.ImageRenderInfo; 
import com.itextpdf.text.pdf.parser.RenderListener; 
import com.itextpdf.text.pdf.parser.TextRenderInfo; 

public class KeywordListener implements RenderListener { 
    
    private List<String> allItems = new ArrayList<String>();

    public void beginTextBlock() {
        //do nothing
    }

    public void renderText(TextRenderInfo renderInfo) {
        allItems.add(renderInfo.getText());//保存所有的项
    }

    public void endTextBlock() {
        //do nothing
    }

    public void renderImage(ImageRenderInfo renderInfo) {
        //do nothing
    }

    public List<String> getAllItems() {
        return allItems;
    }

    public void setAllItems(List<String> allItems) {
        this.allItems = allItems;
    }
}

Excel获取内容工具类

package com.rangz.util;

import java.io.FileInputStream;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.NumberToTextConverter;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * Helpers for reading the cell contents of Excel workbooks ({@code .xlsx} and {@code .xls})
 * as plain strings.
 *
 * <p>The read methods keep no state between calls: each call opens the workbook, copies the
 * requested cells into maps keyed by column index, and releases the file again.
 *
 * @author rangz
 */
public class ExcelUtil {

	/** Pattern applied to every date-formatted numeric cell. */
	private static final String DATE_PATTERN = "yyyy-MM-dd";

	/**
	 * Reads the rows of the first sheet.
	 *
	 * <p>The number of columns read per row is the number of defined cells in the sheet's
	 * first row. Reading stops at the first missing row; that row is still recorded as an
	 * empty map (this mirrors the method's long-standing behavior).
	 *
	 * @param filePath path of the workbook
	 * @param n 1-based number of the row where the data starts; values below 1 are treated as 1
	 * @return one map per row, keyed by 0-based column index; empty when the workbook cannot
	 *         be opened or the first row of the sheet is missing
	 */
	public static ArrayList<LinkedHashMap<Integer, String>> readExcelContent( String filePath, int n) {
		ArrayList<LinkedHashMap<Integer, String>> content = new ArrayList<LinkedHashMap<Integer, String>>();
		Workbook workbook = openWorkbook(filePath);
		if (workbook == null) {
			return content;
		}
		try {
			Sheet firstSheet = workbook.getSheetAt(0);
			Row header = firstSheet.getRow(0);
			if (header == null) {
				// No first row means no column count to go by.
				return content;
			}
			int colNum = header.getPhysicalNumberOfCells();
			int lastRow = firstSheet.getLastRowNum();
			for (int i = Math.max(n - 1, 0); i <= lastRow; i++) {
				Row current = firstSheet.getRow(i);
				LinkedHashMap<Integer, String> cells = new LinkedHashMap<Integer, String>();
				if (current == null) {
					// A missing row ends the data area: record it as empty and stop.
					content.add(cells);
					break;
				}
				fillRow(current, colNum, cells);
				content.add(cells);
			}
			return content;
		} finally {
			release(workbook);
		}
	}

	/**
	 * Converts a cell to its trimmed string representation.
	 *
	 * <ul>
	 *   <li>date-formatted numeric cells are rendered as {@code yyyy-MM-dd};</li>
	 *   <li>other numeric cells are rendered without a trailing {@code .0} and without
	 *       scientific notation for ordinary magnitudes;</li>
	 *   <li>formula cells yield their cached string result, else their cached numeric result;</li>
	 *   <li>blank, error and unknown cells yield an empty string.</li>
	 * </ul>
	 *
	 * @param cell the cell to read; may be {@code null}
	 * @return the cell text, never {@code null}
	 */
	private static String getStringCellValue(Cell cell) {
		if (cell == null) {
			return "";
		}
		String text;
		switch (cell.getCellType()) {
		case Cell.CELL_TYPE_FORMULA:
			text = readFormulaResult(cell);
			break;
		case Cell.CELL_TYPE_STRING:
			text = cell.getStringCellValue();
			break;
		case Cell.CELL_TYPE_NUMERIC:
			// Excel stores dates as numbers, so the cell style decides how to render them.
			if (DateUtil.isCellDateFormatted(cell)) {
				text = new SimpleDateFormat(DATE_PATTERN).format(cell.getDateCellValue());
			} else {
				// Going through "double + string" would yield forms like "1.0E10",
				// which trailing-zero stripping then corrupts to "1.0E1".
				text = NumberToTextConverter.toText(cell.getNumericCellValue());
			}
			break;
		case Cell.CELL_TYPE_BOOLEAN:
			text = String.valueOf(cell.getBooleanCellValue());
			break;
		default:
			text = "";
			break;
		}
		return text == null ? "" : text.trim();
	}

	/**
	 * Reads the cached result of a formula cell: the string result when there is one,
	 * otherwise the numeric result, otherwise (boolean or error result) an empty string.
	 */
	private static String readFormulaResult(Cell cell) {
		try {
			return String.valueOf(cell.getStringCellValue());
		} catch (IllegalStateException notText) {
			// Cached result is not a string; try the numeric form next.
		}
		try {
			return String.valueOf(cell.getNumericCellValue());
		} catch (IllegalStateException notNumeric) {
			return "";
		}
	}

	/** Copies the first {@code colNum} cells of {@code row} into {@code target}, keyed by column index. */
	private static void fillRow(Row row, int colNum, HashMap<Integer, String> target) {
		for (int j = 0; j < colNum; j++) {
			target.put(j, getStringCellValue(row.getCell(j)));
		}
	}

	/**
	 * Opens a workbook, trying the 2007+ format first and the 97-2003 format second.
	 *
	 * @return the workbook, or {@code null} when the file cannot be read
	 */
	private static Workbook openWorkbook(String filePath) {
		try {
			return new XSSFWorkbook(filePath);// 2007+ format
		} catch (Exception notOoxml) {
			// Not readable as an OOXML package; fall back to the binary format below.
		}
		try (FileInputStream in = new FileInputStream(filePath)) {
			return new HSSFWorkbook(in);// 97-2003 format
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Releases the file handle held by a path-opened OOXML workbook. Binary workbooks are
	 * fully loaded from a stream that is already closed, so nothing is held for them.
	 */
	private static void release(Workbook workbook) {
		if (workbook instanceof XSSFWorkbook) {
			// revert() closes without saving; close() would write the package back to
			// the file because a path-opened package is read-write.
			((XSSFWorkbook) workbook).getPackage().revert();
		}
	}

	/**
	 * Strips redundant trailing zeros (and a then-dangling decimal point) from a decimal
	 * string, e.g. {@code "12.50"} becomes {@code "12.5"} and {@code "3.0"} becomes {@code "3"}.
	 * Strings without a decimal point after the first character, and strings in scientific
	 * notation, are returned unchanged.
	 *
	 * @param sNum the number text
	 * @return the number text without redundant trailing zeros
	 */
	public String getNoMoreZero(String sNum) {
		if (sNum.indexOf(".") <= 0) {
			return sNum;
		}
		if (sNum.indexOf('E') >= 0 || sNum.indexOf('e') >= 0) {
			// Trailing zeros here belong to the exponent, not the fraction.
			return sNum;
		}
		String resultStr = sNum.replaceAll("0+?$", "");
		return resultStr.replaceAll("[.]$", "");
	}

	/**
	 * Reads every sheet of a workbook; each sheet becomes one list of rows.
	 *
	 * <p>For each sheet the number of columns read per row is the number of defined cells in
	 * that sheet's first row. A sheet without a first row contributes an empty list, and a
	 * missing row inside a sheet contributes an empty map, so sheet and row positions stay
	 * aligned with the workbook.
	 *
	 * @param filePath absolute path of the workbook
	 * @param n 1-based number of the row where reading starts; values below 1 are treated as 1
	 * @return one list per sheet, each holding one map per row keyed by 0-based column index;
	 *         empty when the workbook cannot be opened
	 */
	public static ArrayList<ArrayList<HashMap<Integer, String>>> readExcelContentMoreSheet(String filePath, int n) {
		ArrayList<ArrayList<HashMap<Integer, String>>> rList = new ArrayList<ArrayList<HashMap<Integer, String>>>();
		Workbook workbook = openWorkbook(filePath);
		if (workbook == null) {
			return rList;
		}
		try {
			for (int k = 0; k < workbook.getNumberOfSheets(); k++) {
				ArrayList<HashMap<Integer, String>> content = new ArrayList<HashMap<Integer, String>>();
				Sheet current = workbook.getSheetAt(k);
				Row header = current.getRow(0);
				if (header != null) {
					int colNum = header.getPhysicalNumberOfCells();
					int lastRow = current.getLastRowNum();
					// Row numbers are 1-based for callers, row indexes 0-based in the sheet.
					for (int i = Math.max(n - 1, 0); i <= lastRow; i++) {
						HashMap<Integer, String> cells = new HashMap<Integer, String>();
						Row dataRow = current.getRow(i);
						if (dataRow != null) {
							fillRow(dataRow, colNum, cells);
						}
						content.add(cells);
					}
				}
				rList.add(content);
			}
			return rList;
		} finally {
			release(workbook);
		}
	}
}

;