1200字范文 > java查找pdf关键字_java实现查找PDF关键字所在页码及其坐标

java查找pdf关键字_java实现查找PDF关键字所在页码及其坐标

时间：2021-09-26 20:26:52

1、因为最近有这方面的需求，用过之后记录一下。

2、此功能与PDF阅读器中的Ctrl+F查找性质相同；如果PDF的内容是图片形式（例如扫描件），则无法定位到关键字。

import com.itextpdf.awt.geom.Rectangle2D.Float;

import com.itextpdf.text.pdf.PdfDictionary;

import com.itextpdf.text.pdf.PdfName;

import com.itextpdf.text.pdf.PdfReader;

import com.itextpdf.text.pdf.parser.*;

import java.io.File;

import java.io.FileInputStream;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

/**

* 消失的太阳

public class MyTest {

public static void main(String[] args) throws IOException {

//1.给定文件

File pdfFile = new File("D://test.pdf");

//2.定义一个byte数组，长度为文件的长度

byte[] pdfData = new byte[(int) pdfFile.length()];

//3.IO流读取文件内容到byte数组

FileInputStream inputStream = null;

try {

inputStream = new FileInputStream(pdfFile);

inputStream.read(pdfData);

} catch (IOException e) {

throw e;

} finally {

if (inputStream != null) {

try {

inputStream.close();

} catch (IOException e) {

}

//4.指定关键字

String keyword = "消失的太阳：";

//5.调用方法，给定关键字和文件

List positions = findKeywordPostions(pdfData, keyword);

//6.返回值类型是 List 每个list元素代表一个匹配的位置，分别为 float[0]所在页码 float[1]所在x轴 float[2]所在y轴

System.out.println("total:" + positions.size());

if (positions != null && positions.size() > 0) {

for (float[] position : positions) {

System.out.print("pageNum: " + (int) position[0]);

System.out.print("\tx: " + position[1]);

System.out.println("\ty: " + position[2]);

}

/**

* findKeywordPostions

* @param pdfData 通过IO流 PDF文件转化的byte数组

* @param keyword 关键字

* @return List : float[0]:pageNum float[1]:x float[2]:y

* @throws IOException

public static List findKeywordPostions(byte[] pdfData, String keyword) throws IOException {

List result = new ArrayList<>();

List pdfPageContentPositions = getPdfContentPostionsList(pdfData);

for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) {

List charPositions = findPositions(keyword, pdfPageContentPosition);

if (charPositions == null || charPositions.size() < 1) {

continue;

}

result.addAll(charPositions);

}

return result;

}

private static List getPdfContentPostionsList(byte[] pdfData) throws IOException {

PdfReader reader = new PdfReader(pdfData);

List result = new ArrayList<>();

int pages = reader.getNumberOfPages();

for (int pageNum = 1; pageNum <= pages; pageNum++) {

float width = reader.getPageSize(pageNum).getWidth();

float height = reader.getPageSize(pageNum).getHeight();

PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);

//解析pdf，定位位置

PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);

PdfDictionary pageDic = reader.getPageN(pageNum);

PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);

try {

processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

} catch (IOException e) {

reader.close();

throw e;

}

String content = pdfRenderListener.getContent();

List charPositions = pdfRenderListener.getcharPositions();

List positionsList = new ArrayList<>();

for (CharPosition charPosition : charPositions) {

float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()};

positionsList.add(positions);

}

PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();

pdfPageContentPositions.setContent(content);

pdfPageContentPositions.setPostions(positionsList);

result.add(pdfPageContentPositions);

}

reader.close();

return result;

}

private static List findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {

List result = new ArrayList<>();

String content = pdfPageContentPositions.getContent();

List charPositions = pdfPageContentPositions.getPositions();

for (int pos = 0; pos < content.length(); ) {

int positionIndex = content.indexOf(keyword, pos);

if (positionIndex == -1) {

break;

}

float[] postions = charPositions.get(positionIndex);

result.add(postions);

pos = positionIndex + 1;

}

return result;

}

private static class PdfPageContentPositions {

private String content;

private List positions;

public String getContent() {

return content;

}

public void setContent(String content) {

this.content = content;

}

public List getPositions() {

return positions;

}

public void setPostions(List positions) {

this.positions = positions;

}

private static class PdfRenderListener implements RenderListener {

private int pageNum;

private float pageWidth;

private float pageHeight;

private StringBuilder contentBuilder = new StringBuilder();

private List charPositions = new ArrayList<>();

public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {

this.pageNum = pageNum;

this.pageWidth = pageWidth;

this.pageHeight = pageHeight;

}

public void beginTextBlock() {

}

public void renderText(TextRenderInfo renderInfo) {

List characterRenderInfos = renderInfo.getCharacterRenderInfos();

for (TextRenderInfo textRenderInfo : characterRenderInfos) {

String word = textRenderInfo.getText();

if (word.length() > 1) {

word = word.substring(word.length() - 1, word.length());

}

Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();

float x = (float)rectangle.getX();

float y = (float)rectangle.getY();

// float x = (float)rectangle.getCenterX();

// float y = (float)rectangle.getCenterY();

// double x = rectangle.getMinX();

// double y = rectangle.getMaxY();

//这两个是关键字在所在页面的XY轴的百分比

float xPercent = Math.round(x / pageWidth * 10000) / 10000f;

float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;

// CharPosition charPosition = new CharPosition(pageNum, xPercent, yPercent);

CharPosition charPosition = new CharPosition(pageNum, (float)x, (float)y);

charPositions.add(charPosition);

contentBuilder.append(word);

}

public void endTextBlock() {

}

public void renderImage(ImageRenderInfo renderInfo) {

}

public String getContent() {

return contentBuilder.toString();

}

public List getcharPositions() {

return charPositions;

}

private static class CharPosition {

private int pageNum = 0;

private float x = 0;

private float y = 0;

public CharPosition(int pageNum, float x, float y) {

this.pageNum = pageNum;

this.x = x;

this.y = y;

}

public int getPageNum() {

return pageNum;

}

public float getX() {

return x;

}

public float getY() {

return y;

}

@Override

public String toString() {

return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";

}

总结

以上所述是小编给大家介绍的java实现查找PDF关键字所在页码及其坐标,希望对大家有所帮助，如果大家有任何疑问请给我留言，小编会及时回复大家的。在此也非常感谢大家对网站的支持！

如果你觉得本文对你有帮助，欢迎转载，烦请注明出处，谢谢！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。