Skip to content

Commit 5efd9a9

Browse files
for PDL, Download PDF if available
1 parent 84fbe81 commit 5efd9a9

File tree

2 files changed

+129
-28
lines changed

2 files changed

+129
-28
lines changed

bull/pdl-queue/consumer.js

Lines changed: 112 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ const _ = require("lodash");
77
const winston = require("winston");
88
const logger = winston.loggers.get("defaultLogger");
99
const { logUserData } = require("./../../utils/helper");
10+
const { customFetch } = require("../../utils/helper");
11+
const { Readable } = require("stream");
1012

1113
var JSZip = require("jszip");
1214
PDLQueue.on("active", (job, jobPromise) => {
@@ -57,12 +59,41 @@ async function getZipAndBytelength(no_of_pages, id, title, job) {
5759
return [zip, byteLength];
5860
}
5961

60-
function setHeaders(metadata, byteLength, title) {
62+
/**
 * Downloads the PDF at `pdfUrl` and reports its size in bytes.
 *
 * Job progress is set to 30 once a 200 response has been received and to 60
 * once the body has been read into memory.
 *
 * @param {string} pdfUrl - Address of the PDF to download.
 * @param {object} job - Queue job; only `job.progress(n)` is used here.
 * @returns {Promise<{pdfBuffer: Buffer, byteLength: number} | null>} The
 *   downloaded bytes and their length, or `null` when the download failed
 *   (the failure is written to `console.error`). Callers must check for
 *   `null` before destructuring the result.
 */
async function getPdfAndBytelength(pdfUrl, job) {
  try {
    const response = await customFetch(
      pdfUrl,
      "GET",
      new Headers({
        "Content-Type": "application/pdf",
      }),
      "pdf"
    );

    // A nullish result means there is no response object to inspect at all.
    if (response === undefined || response === null) {
      throw new Error("Failed to download PDF. No response received");
    }

    // customFetch returns the bare number 404 (not a response object) when
    // the resource is missing, so normalise that before reading `.status`.
    const status = response === 404 ? 404 : response.status;
    if (status !== 200) {
      throw new Error(`Failed to download PDF. Status Code: ${status} `);
    }

    job.progress(30);
    const buffer = await response.buffer();
    job.progress(60);
    return {
      pdfBuffer: buffer,
      byteLength: buffer.byteLength,
    };
  } catch (error) {
    console.error("Error:", error);
    return null;
  }
}
90+
91+
function setHeaders(metadata, byteLength, title, contentType) {
6192
let headers = {};
6293
headers[
6394
"Authorization"
6495
] = `LOW ${process.env.access_key}:${process.env.secret_key}`;
65-
headers["Content-type"] = "application/zip";
96+
headers["Content-type"] = `application/${contentType}`;
6697
headers["Content-length"] = byteLength;
6798
headers["X-Amz-Auto-Make-Bucket"] = 1;
6899
headers["X-Archive-meta-collection"] = "opensource";
@@ -83,11 +114,16 @@ function setHeaders(metadata, byteLength, title) {
83114
return headers;
84115
}
85116

86-
async function uploadToIA(zip, metadata, byteLength, email, job) {
117+
async function uploadZipToIA(zip, metadata, byteLength, email, job) {
87118
const bucketTitle = metadata.IAIdentifier;
88119
const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}_images.zip`;
89120
metadata = _.omit(metadata, "coverImage");
90-
let headers = setHeaders(metadata, byteLength, metadata.title);
121+
let headers = setHeaders(
122+
metadata,
123+
byteLength,
124+
metadata.title,
125+
job.data.details.contentType
126+
);
91127
await zip.generateNodeStream({ type: "nodebuffer", streamFiles: true }).pipe(
92128
request(
93129
{
@@ -112,27 +148,78 @@ async function uploadToIA(zip, metadata, byteLength, email, job) {
112148
);
113149
}
114150

115-
PDLQueue.process(async (job, done) => {
116-
const jobLogs = job.data.details;
117-
const trueURI = `http://archive.org/details/${job.data.details.IAIdentifier}`;
118-
jobLogs["trueURI"] = trueURI;
119-
jobLogs["userName"] = job.data.details.userName;
120-
job.log(JSON.stringify(jobLogs));
121-
logUserData(jobLogs["userName"], "Panjab Digital Library");
122-
const [zip, byteLength] = await getZipAndBytelength(
123-
job.data.details.Pages,
124-
job.data.details.bookID,
125-
job.data.details.title,
126-
job
127-
);
128-
job.progress(90);
129-
await uploadToIA(
130-
zip,
131-
job.data.details,
151+
/**
 * Uploads a PDF buffer to the Internet Archive S3 endpoint with a PUT to
 * `http://s3.us.archive.org/<IAIdentifier>/<IAIdentifier>.pdf`.
 *
 * The returned promise settles only once the `request` callback has fired,
 * so `await uploadPdfToIA(...)` really waits for the upload to finish.
 *
 * @param {Buffer} pdfBuffer - PDF bytes to upload.
 * @param {object} metadata - Book details; `IAIdentifier` and `title` are
 *   read here and the whole object is handed to `setHeaders`.
 * @param {number} byteLength - Size of `pdfBuffer`, passed to `setHeaders`.
 * @param {string} email - Currently unused: the success e-mail is disabled.
 * @param {object} job - Queue job; `job.data.details.contentType` is read.
 * @returns {Promise<void>} Resolves when the archive answers with status 200.
 * @throws {Error} Rejects when the request errors or the status is not 200;
 *   the failure is also written to `logger`.
 */
async function uploadPdfToIA(pdfBuffer, metadata, byteLength, email, job) {
  const bucketTitle = metadata.IAIdentifier;
  const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}.pdf`;
  // NOTE(review): uploadZipToIA strips `coverImage` from metadata before
  // calling setHeaders; this path does not — confirm whether it should.
  const headers = setHeaders(
    metadata,
    byteLength,
    metadata.title,
    job.data.details.contentType
  );
  const options = {
    method: "PUT",
    uri: IAuri,
    headers: headers,
  };

  return new Promise((resolve, reject) => {
    // Logs the failure in the existing format and fails the pending upload.
    const fail = (reason) => {
      const message = `IA Failure PDL ${reason}`;
      logger.log({
        level: "error",
        message: message,
      });
      reject(new Error(message));
    };

    const upload = request(options, (error, response, body) => {
      // On a transport error `response` is undefined, so check `error`
      // first instead of dereferencing `response.statusCode`.
      if (error) {
        fail(error.message);
        return;
      }
      if (response && response.statusCode === 200) {
        // Success e-mail (EmailProducer) is intentionally disabled for now.
        resolve();
        return;
      }
      fail(body);
    });

    Readable.from(pdfBuffer).pipe(upload);
  });
}
179+
180+
/**
 * Worker for Panjab Digital Library jobs.
 *
 * Records the target archive.org URI on the job details, logs the job, then
 * uploads either the book's PDF (when `details.pdfUrl` is set) or a zip of
 * its page images. `done(null, true)` is called on success and `done(error)`
 * on any failure.
 */
PDLQueue.process(async (job, done) => {
  try {
    // `details` is job.data.details itself, so adding `trueURI` below also
    // updates the metadata object later passed to the upload helpers.
    const details = job.data.details;
    details["trueURI"] = `http://archive.org/details/${details.IAIdentifier}`;
    job.log(JSON.stringify(details));
    logUserData(details["userName"], "Panjab Digital Library");

    if (details.pdfUrl) {
      const pdf = await getPdfAndBytelength(details.pdfUrl, job);
      // getPdfAndBytelength returns null when the download fails; fail the
      // job with a clear message instead of a destructuring TypeError.
      if (!pdf) {
        throw new Error(`Failed to download PDF from ${details.pdfUrl}`);
      }
      await uploadPdfToIA(
        pdf.pdfBuffer,
        details,
        pdf.byteLength,
        details.email,
        job
      );
    } else {
      const [zip, byteLength] = await getZipAndBytelength(
        details.Pages,
        details.bookID,
        details.title,
        job
      );
      job.progress(90);
      await uploadZipToIA(zip, details, byteLength, details.email, job);
    }

    job.progress(100);
    done(null, true);
  } catch (error) {
    console.error("Error processing job:", error);
    // Pass Error instances through untouched so the stack and message
    // survive; only wrap non-Error throwables.
    done(error instanceof Error ? error : new Error(String(error)));
  }
});

utils/helper.js

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,12 @@ module.exports = {
2525
return title.replace(/[ \(\)\[\],:]/g, "");
2626
},
2727

28-
customFetch: async (URI, method = "GET", headers = new Headers()) => {
28+
customFetch: async (
29+
URI,
30+
method = "GET",
31+
headers = new Headers(),
32+
contentType = "other"
33+
) => {
2934
return fetch(URI, {
3035
method: method,
3136
headers: headers,
@@ -34,7 +39,10 @@ module.exports = {
3439
(res) => {
3540
if (res.status === 404) {
3641
return 404;
37-
} else return res.json();
42+
} else {
43+
const result = contentType === "pdf" ? res : res.json();
44+
return result;
45+
}
3846
},
3947
(err) => {
4048
logger.log({
@@ -105,7 +113,8 @@ module.exports = {
105113
let PNdetails = {};
106114
const keys = $(".ubhypers");
107115
const values = $(".dhypers");
108-
116+
const downloadPdfLink = $("#downloadpdf a")[0]?.attribs.href;
117+
let contentType = "zip";
109118
function addOtherMetaData(limit, keys, values, PNdetails) {
110119
let value;
111120
for (let i = 0; i < values.length; i++) {
@@ -173,6 +182,11 @@ module.exports = {
173182
src = src.match(/pdl.*/gm);
174183
PNdetails.coverImage = `http://panjabdigilib.org/${src}`;
175184

185+
if (downloadPdfLink?.length) {
186+
contentType = "pdf";
187+
PNdetails.pdfUrl = `http://www.panjabdigilib.org/webuser/searches/${downloadPdfLink}`;
188+
}
189+
PNdetails.contentType = contentType;
176190
delete PNdetails[""];
177191

178192
return PNdetails;

0 commit comments

Comments
 (0)