Apache Tika Disable Tesseract

09.09.2019

40 - Comments

logobossally.netlify.com › 〓 Apache Tika Disable Tesseract 〓

Apache Tika Disable Tesseract Average rating: 5,0/5 2542 reviews

Apache Tika Disable Tesseract Download

Apache Tika is a content detection and analysis framework, written in Java and stewarded by the Apache Software Foundation. It detects and extracts metadata and text from over a thousand different file types and, in addition to providing a Java library, has server and command-line editions suitable for use from other programming languages. Cirque du soleil ovo ticketmaster.

Apache Tika Disable Tesseract Download

Apache Tika + Tesseract-OCR to scan Chinese text in a PDF

pom.xml

<project xmlns='http://maven.apache.org/POM/4.0.0'
         xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
         xsi:schemaLocation='http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd'>

  <!-- NOTE: this listing is an excerpt. It contains no modelVersion or version
       elements; add them before using it as a real pom.xml. -->

  <groupId>com.swordfish.readpdf</groupId>
  <artifactId>readpdf</artifactId>

  <dependencies>
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
    </dependency>

    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-parsers</artifactId>
    </dependency>

    <dependency>
      <groupId>org.apache.tika</groupId>
      <!-- NOTE: the artifactId of this dependency is missing from the listing. -->
    </dependency>

    <dependency>
      <groupId>com.levigo.jbig2</groupId>
      <artifactId>levigo-jbig2-imageio</artifactId>
    </dependency>

    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>jbig2-imageio</artifactId>
    </dependency>

    <dependency>
      <groupId>org.xerial</groupId>
      <artifactId>sqlite-jdbc</artifactId>
    </dependency>

    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-core</artifactId>
    </dependency>

    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-jpeg2000</artifactId>
    </dependency>
  </dependencies>

</project>

Test.java

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;

/**

1.install tesseract => https://github.com/tesseract-ocr/tesseract/wiki

2.download your target language package from :https://github.com/tesseract-ocr/tessdata and put in the 'tessdata' folder

3.reference => https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/

publicclassMain {

publicstaticvoidmain(String[] args) {

// TODO Auto-generated method stub

System.err.println(getTextFromTesseract('/work/projects/projects-2018/read_pdf2/vr.pdf'));

}

publicstaticStringgetTextFromPdfByTika(StringfilePath) {

File file =newFile(filePath);

String content;

try {

content =newTika().parseToString(file);

return content;

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (TikaException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

return'';

}

publicstaticStringgetTextFromTesseract(StringfilePath) {

try {

InputStream pdf =Files.newInputStream(Paths.get(filePath));

ByteArrayOutputStream out =newByteArrayOutputStream();

TikaConfig config =TikaConfig.getDefaultConfig();

// TikaConfig fromFile = new TikaConfig('/path/to/file');

BodyContentHandler handler =newBodyContentHandler(out);

Parser parser =newAutoDetectParser(config);

Metadata meta =newMetadata();

ParseContext parsecontext =newParseContext();

PDFParserConfig pdfConfig =newPDFParserConfig();

pdfConfig.setExtractInlineImages(true);

TesseractOCRConfig tesserConfig =newTesseractOCRConfig();

tesserConfig.setLanguage('chi_sim');

tesserConfig.setTesseractPath('/usr/local/Cellar/tesseract/3.05.01/bin');

//把chi_sim.traineddata放置在tessdata目录下

tesserConfig.setTessdataPath('/usr/local/Cellar/tesseract/3.05.01/share/tessdata');

parsecontext.set(Parser.class, parser);

parsecontext.set(PDFParserConfig.class, pdfConfig);

parsecontext.set(TesseractOCRConfig.class, tesserConfig);

parser.parse(pdf, handler, meta, parsecontext);

String s =newString(out.toByteArray(),Charset.defaultCharset());

return s;

} catch (Exception e) {

// TODO Auto-generated catch block

e.printStackTrace();

return'';

}

/**

* @Title: getTextFromPdf

* @Description: 读取pdf文件内容

* @param filePath

* @return: 读出的pdf的内容

publicstaticStringgetTextFromPdf(StringfilePath) {

PDDocument pdDoc;

try {

pdDoc =PDDocument.load(newFile(filePath));

PDFTextStripper pdfStripper =newPDFTextStripper();

String result = pdfStripper.getText(pdDoc);

return result;

} catch (InvalidPasswordException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

return'';

}

logobossally

Apache Tika Disable Tesseract Download

Most Popular News