09.09.2019»»понедельник

Apache Tika Disable Tesseract

    40 - Comments
Apache Tika Disable Tesseract Average rating: 5,0/5 2542 reviews
  1. Apache Tika Disable Tesseract Download

Apache Tika is a content detection and analysis framework, written in Java, stewarded at the Apache Software Foundation. It detects and extracts metadata and text from over a thousand different file types. In addition to a Java library, it provides server and command-line editions suitable for use from other programming languages. Cirque du soleil ovo ticketmaster.

Apache Tika Disable Tesseract Download

Apache tika disable tesseract video
Apache Tika + Tesseract-OCR to scan Chinese text in pdf
ApacheTika
pom.xml
<!-- Maven build for the PDF text-extraction sample (Apache Tika + Tesseract OCR, Apache PDFBox). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.swordfish.readpdf</groupId>
  <artifactId>readpdf</artifactId>
  <version>0.0.1</version>

  <properties>
    <!-- The sources contain non-ASCII comments; pin the encoding instead of using the platform default. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Set the language level explicitly; the compiler plugin's legacy defaults are rejected by current JDKs. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.10</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-parsers</artifactId>
      <version>1.18</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-core</artifactId>
      <version>1.18</version>
    </dependency>

    <!-- ImageIO plugins for image formats embedded in PDFs (JBIG2, JPEG 2000). -->
    <!-- NOTE(review): the two JBIG2 artifacts below appear to overlap; confirm whether both are needed. -->
    <dependency>
      <groupId>com.levigo.jbig2</groupId>
      <artifactId>levigo-jbig2-imageio</artifactId>
      <version>1.6.5</version>
    </dependency>
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>jbig2-imageio</artifactId>
      <version>3.0.0</version>
    </dependency>

    <dependency>
      <groupId>org.xerial</groupId>
      <artifactId>sqlite-jdbc</artifactId>
      <version>3.23.1</version>
    </dependency>

    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-core</artifactId>
      <version>1.4.0</version>
    </dependency>
    <dependency>
      <groupId>com.github.jai-imageio</groupId>
      <artifactId>jai-imageio-jpeg2000</artifactId>
      <version>1.3.0</version>
    </dependency>
  </dependencies>
</project>
Test.java
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
/**
1.install tesseract => https://github.com/tesseract-ocr/tesseract/wiki
2.download your target language package from :https://github.com/tesseract-ocr/tessdata and put in the 'tessdata' folder
3.reference => https://www.woodmark.de/blog/parsing-text-within-image-files-or-embedded-images-pdfs-using-apache-tika-ocr/
*/
publicclassMain {
publicstaticvoidmain(String[] args) {
// TODO Auto-generated method stub
System.err.println(getTextFromTesseract('/work/projects/projects-2018/read_pdf2/vr.pdf'));
}
publicstaticStringgetTextFromPdfByTika(StringfilePath) {
File file =newFile(filePath);
String content;
try {
content =newTika().parseToString(file);
return content;
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (TikaException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return'';
}
publicstaticStringgetTextFromTesseract(StringfilePath) {
try {
InputStream pdf =Files.newInputStream(Paths.get(filePath));
ByteArrayOutputStream out =newByteArrayOutputStream();
TikaConfig config =TikaConfig.getDefaultConfig();
// TikaConfig fromFile = new TikaConfig('/path/to/file');
BodyContentHandler handler =newBodyContentHandler(out);
Parser parser =newAutoDetectParser(config);
Metadata meta =newMetadata();
ParseContext parsecontext =newParseContext();
PDFParserConfig pdfConfig =newPDFParserConfig();
pdfConfig.setExtractInlineImages(true);
TesseractOCRConfig tesserConfig =newTesseractOCRConfig();
tesserConfig.setLanguage('chi_sim');
tesserConfig.setTesseractPath('/usr/local/Cellar/tesseract/3.05.01/bin');
//把chi_sim.traineddata放置在tessdata目录下
tesserConfig.setTessdataPath('/usr/local/Cellar/tesseract/3.05.01/share/tessdata');
parsecontext.set(Parser.class, parser);
parsecontext.set(PDFParserConfig.class, pdfConfig);
parsecontext.set(TesseractOCRConfig.class, tesserConfig);
parser.parse(pdf, handler, meta, parsecontext);
String s =newString(out.toByteArray(),Charset.defaultCharset());
return s;
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
return'';
}
}
/**
*
* @Title: getTextFromPdf
* @Description: 读取pdf文件内容
* @param filePath
* @return: 读出的pdf的内容
*/
publicstaticStringgetTextFromPdf(StringfilePath) {
PDDocument pdDoc;
try {
pdDoc =PDDocument.load(newFile(filePath));
PDFTextStripper pdfStripper =newPDFTextStripper();
String result = pdfStripper.getText(pdDoc);
return result;
} catch (InvalidPasswordException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return'';
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment