Apache Tika Disable Tesseract
09.09.2019
40 - Comments
Apache Tika Disable Tesseract. Average rating: 5.0/5 (2542 reviews)
Apache Tika is a content detection and analysis framework, written in Java, stewarded at the Apache Software Foundation. It detects and extracts metadata and text from over a thousand different file types and, in addition to a Java library, provides server and command-line editions suitable for use from other programming languages.
Apache Tika Disable Tesseract Download
Apache Tika + Tesseract-OCR to scan Chinese text in pdf
pom.xml
<!--
  Maven build for the "readpdf" sample: PDFBox and Apache Tika for PDF text
  extraction, plus ImageIO plugins for image formats that can be embedded in PDFs.
-->
<project xmlns='http://maven.apache.org/POM/4.0.0'
         xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
         xsi:schemaLocation='http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd'>
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.swordfish.readpdf</groupId>
    <artifactId>readpdf</artifactId>
    <version>0.0.1</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.10</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers</artifactId>
            <version>1.18</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
        <!-- Keep this version in step with tika-parsers above. -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
            <version>1.18</version>
        </dependency>

        <!--
          JBIG2 ImageIO plugins.
          NOTE(review): org.apache.pdfbox:jbig2-imageio appears to be the successor of
          com.levigo.jbig2:levigo-jbig2-imageio, so one of these two is probably
          redundant. Confirm before removing either.
        -->
        <dependency>
            <groupId>com.levigo.jbig2</groupId>
            <artifactId>levigo-jbig2-imageio</artifactId>
            <version>1.6.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>jbig2-imageio</artifactId>
            <version>3.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.xerial</groupId>
            <artifactId>sqlite-jdbc</artifactId>
            <version>3.23.1</version>
        </dependency>

        <!-- Additional ImageIO readers, including JPEG 2000. -->
        <dependency>
            <groupId>com.github.jai-imageio</groupId>
            <artifactId>jai-imageio-core</artifactId>
            <version>1.4.0</version>
        </dependency>
        <dependency>
            <groupId>com.github.jai-imageio</groupId>
            <artifactId>jai-imageio-jpeg2000</artifactId>
            <version>1.3.0</version>
        </dependency>
    </dependencies>
</project>
Test.java
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
/** |
1.install tesseract => https://github.com/tesseract-ocr/tesseract/wiki |
2.download your target language package from :https://github.com/tesseract-ocr/tessdata and put in the 'tessdata' folder |
3.reference => https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/ |
*/ |
publicclassMain { |
publicstaticvoidmain(String[] args) { |
// TODO Auto-generated method stub |
System.err.println(getTextFromTesseract('/work/projects/projects-2018/read_pdf2/vr.pdf')); |
} |
publicstaticStringgetTextFromPdfByTika(StringfilePath) { |
File file =newFile(filePath); |
String content; |
try { |
content =newTika().parseToString(file); |
return content; |
} catch (IOException e) { |
// TODO Auto-generated catch block |
e.printStackTrace(); |
} catch (TikaException e) { |
// TODO Auto-generated catch block |
e.printStackTrace(); |
} |
return''; |
} |
publicstaticStringgetTextFromTesseract(StringfilePath) { |
try { |
InputStream pdf =Files.newInputStream(Paths.get(filePath)); |
ByteArrayOutputStream out =newByteArrayOutputStream(); |
TikaConfig config =TikaConfig.getDefaultConfig(); |
// TikaConfig fromFile = new TikaConfig('/path/to/file'); |
BodyContentHandler handler =newBodyContentHandler(out); |
Parser parser =newAutoDetectParser(config); |
Metadata meta =newMetadata(); |
ParseContext parsecontext =newParseContext(); |
PDFParserConfig pdfConfig =newPDFParserConfig(); |
pdfConfig.setExtractInlineImages(true); |
TesseractOCRConfig tesserConfig =newTesseractOCRConfig(); |
tesserConfig.setLanguage('chi_sim'); |
tesserConfig.setTesseractPath('/usr/local/Cellar/tesseract/3.05.01/bin'); |
//把chi_sim.traineddata放置在tessdata目录下 |
tesserConfig.setTessdataPath('/usr/local/Cellar/tesseract/3.05.01/share/tessdata'); |
parsecontext.set(Parser.class, parser); |
parsecontext.set(PDFParserConfig.class, pdfConfig); |
parsecontext.set(TesseractOCRConfig.class, tesserConfig); |
parser.parse(pdf, handler, meta, parsecontext); |
String s =newString(out.toByteArray(),Charset.defaultCharset()); |
return s; |
} catch (Exception e) { |
// TODO Auto-generated catch block |
e.printStackTrace(); |
return''; |
} |
} |
/** |
* |
* @Title: getTextFromPdf |
* @Description: 读取pdf文件内容 |
* @param filePath |
* @return: 读出的pdf的内容 |
*/ |
publicstaticStringgetTextFromPdf(StringfilePath) { |
PDDocument pdDoc; |
try { |
pdDoc =PDDocument.load(newFile(filePath)); |
PDFTextStripper pdfStripper =newPDFTextStripper(); |
String result = pdfStripper.getText(pdDoc); |
return result; |
} catch (InvalidPasswordException e) { |
// TODO Auto-generated catch block |
e.printStackTrace(); |
} catch (IOException e) { |
// TODO Auto-generated catch block |
e.printStackTrace(); |
} |
return''; |
} |
} |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment