forked from tabulapdf/tabula-java
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPDFGetter.java
More file actions
66 lines (56 loc) · 2.57 KB
/
PDFGetter.java
File metadata and controls
66 lines (56 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// Requires the latest version of tabula-java, built from source against PDFBox 3.x
// and saved as /home/sullija/bin/tabula.jar.
// Run with jbang:
//   jbang --cp "/home/sullija/bin/tabula.jar" PDFGetter.java
// or compile and run with the JDK (the class path must include "." so the compiled class is found):
//   javac -cp "/home/sullija/bin/tabula.jar" PDFGetter.java
//   java -cp "/home/sullija/bin/tabula.jar:." PDFGetter
import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import java.io.InputStream;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.Loader;
/**
 * Small demo that prints the tables tabula-java finds in a PDF.
 *
 * <p>Each table row is written to standard output on its own line, with every
 * cell followed by a {@code |} separator. After all pages have been visited,
 * the total number of pages is printed.
 */
public class PDFGetter {

    /** PDF that is read when no other file is supplied. */
    private static final String DEFAULT_PDF_PATH =
            "/home/sullija/dev/java/tabula-java/src/test/resources/technology/tabula/table_report.pdf";

    /** Number of leading pages whose tables are printed; later pages are only counted. */
    private static final int MAX_PAGES_TO_PRINT = 3;

    /**
     * Extracts and prints the tables of the default sample PDF
     * ({@link #DEFAULT_PDF_PATH}).
     */
    public void get() {
        get(new File(DEFAULT_PDF_PATH));
    }

    /**
     * Extracts and prints the tables found on the first
     * {@link #MAX_PAGES_TO_PRINT} pages of {@code pdfFile}, then prints the
     * total page count.
     *
     * <p>An {@link IOException} while loading or reading the document is
     * reported on standard error and not propagated.
     *
     * @param pdfFile the PDF document to read
     */
    public void get(File pdfFile) {
        // try-with-resources guarantees the document is closed even when
        // extraction fails part-way through.
        try (PDDocument document = Loader.loadPDF(pdfFile)) {
            SpreadsheetExtractionAlgorithm extractor = new SpreadsheetExtractionAlgorithm();
            PageIterator pages = new ObjectExtractor(document).extract();
            int pageNo = 0;
            // Every page is visited so that the final count covers the whole
            // document, but only the leading pages are searched for tables.
            while (pages.hasNext()) {
                Page page = pages.next();
                if (pageNo < MAX_PAGES_TO_PRINT) {
                    List<Table> tables = extractor.extract(page);
                    for (Table table : tables) {
                        printTable(table);
                    }
                }
                pageNo++;
            }
            System.out.println("The PageNo: " + pageNo);
        } catch (IOException e) {
            System.err.println("Could not read PDF " + pdfFile + ": " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Prints one table: a line per row, each cell followed by {@code |}.
     *
     * @param table the extracted table to print
     */
    // Table.getRows() is declared with the raw RectangularTextContainer type,
    // so the raw type cannot be avoided here.
    @SuppressWarnings("rawtypes")
    private static void printTable(Table table) {
        for (List<RectangularTextContainer> row : table.getRows()) {
            StringBuilder line = new StringBuilder();
            for (RectangularTextContainer cell : row) {
                // Cell text uses \r to join text chunks; flatten it to a space
                // so each row stays on a single output line.
                line.append(cell.getText().replace("\r", " ")).append('|');
            }
            System.out.println(line);
        }
    }

    /**
     * Entry point. Reads the PDF named by the first argument, or the default
     * sample PDF when no argument is given.
     *
     * @param args optional: path of the PDF to read
     */
    public static void main(String[] args) {
        PDFGetter pdf = new PDFGetter();
        if (args.length > 0) {
            pdf.get(new File(args[0]));
        } else {
            pdf.get();
        }
        System.out.println("My First Java Program.");
    }
}