Apache Tika Disable Tesseract
09.09.2019
40 - Comments
Apache Tika Disable Tesseract Average rating: 5,0/5 2542 reviews
Apache Tika is a content detection and analysis framework, written in Java and stewarded by the Apache Software Foundation. It detects and extracts metadata and text from over a thousand different file types and, in addition to a Java library, provides server and command-line editions suitable for use from other programming languages.
Apache Tika Disable Tesseract Download

Apache Tika + Tesseract-OCR to scan Chinese text in pdf


pom.xml
<!--
  Build descriptor for the PDF text-extraction sample.
  Pulls in PDFBox and Tika (core + parsers) plus the ImageIO plugins used to
  decode JBIG2 / JPEG 2000 images embedded in PDFs.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.swordfish.readpdf</groupId>
  <artifactId>readpdf</artifactId>
  <version>0.0.1</version>

  <properties>
    <!-- Pin encoding and language level so the build does not depend on platform/plugin defaults. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.10</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-parsers</artifactId>
      <version>1.18</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-core</artifactId>
      <version>1.18</version>
    </dependency>
    <!--
      NOTE(review): the next two artifacts both appear to provide a JBIG2 ImageIO plugin
      (org.apache.pdfbox:jbig2-imageio looks like the successor of the levigo one).
      One is probably sufficient; confirm before removing either.
    -->
    <dependency>
      <groupId>com.levigo.jbig2</groupId>
      <artifactId>levigo-jbig2-imageio</artifactId>
      <version>1.6.5</version>
    </dependency>
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>jbig2-imageio</artifactId>
      <version>3.0.0</version>
    </dependency>
    <dependency>
      <groupId>org.xerial</groupId>
      <artifactId>sqlite-jdbc</artifactId>
      <version>3.23.1</version>
    </dependency>
    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-core</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-jpeg2000</artifactId>
      <version>1.3.0</version>
    </dependency>
  </dependencies>
</project>
Test.java
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
| /** |
| 1.install tesseract => https://github.com/tesseract-ocr/tesseract/wiki |
| 2.download your target language package from :https://github.com/tesseract-ocr/tessdata and put in the 'tessdata' folder |
| 3.reference => https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/ |
| */ |
| publicclassMain { |
| publicstaticvoidmain(String[] args) { |
| // TODO Auto-generated method stub |
| System.err.println(getTextFromTesseract('/work/projects/projects-2018/read_pdf2/vr.pdf')); |
| } |
| publicstaticStringgetTextFromPdfByTika(StringfilePath) { |
| File file =newFile(filePath); |
| String content; |
| try { |
| content =newTika().parseToString(file); |
| return content; |
| } catch (IOException e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| } catch (TikaException e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| } |
| return''; |
| } |
| publicstaticStringgetTextFromTesseract(StringfilePath) { |
| try { |
| InputStream pdf =Files.newInputStream(Paths.get(filePath)); |
| ByteArrayOutputStream out =newByteArrayOutputStream(); |
| TikaConfig config =TikaConfig.getDefaultConfig(); |
| // TikaConfig fromFile = new TikaConfig('/path/to/file'); |
| BodyContentHandler handler =newBodyContentHandler(out); |
| Parser parser =newAutoDetectParser(config); |
| Metadata meta =newMetadata(); |
| ParseContext parsecontext =newParseContext(); |
| PDFParserConfig pdfConfig =newPDFParserConfig(); |
| pdfConfig.setExtractInlineImages(true); |
| TesseractOCRConfig tesserConfig =newTesseractOCRConfig(); |
| tesserConfig.setLanguage('chi_sim'); |
| tesserConfig.setTesseractPath('/usr/local/Cellar/tesseract/3.05.01/bin'); |
| //把chi_sim.traineddata放置在tessdata目录下 |
| tesserConfig.setTessdataPath('/usr/local/Cellar/tesseract/3.05.01/share/tessdata'); |
| parsecontext.set(Parser.class, parser); |
| parsecontext.set(PDFParserConfig.class, pdfConfig); |
| parsecontext.set(TesseractOCRConfig.class, tesserConfig); |
| parser.parse(pdf, handler, meta, parsecontext); |
| String s =newString(out.toByteArray(),Charset.defaultCharset()); |
| return s; |
| } catch (Exception e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| return''; |
| } |
| } |
| /** |
| * |
| * @Title: getTextFromPdf |
| * @Description: 读取pdf文件内容 |
| * @param filePath |
| * @return: 读出的pdf的内容 |
| */ |
| publicstaticStringgetTextFromPdf(StringfilePath) { |
| PDDocument pdDoc; |
| try { |
| pdDoc =PDDocument.load(newFile(filePath)); |
| PDFTextStripper pdfStripper =newPDFTextStripper(); |
| String result = pdfStripper.getText(pdDoc); |
| return result; |
| } catch (InvalidPasswordException e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| } catch (IOException e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| } |
| return''; |
| } |
| } |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment