
Commit f9e1f8e

Merge pull request #224 from okerekechinweotito/feat/pdl-download-pdf
Fixes [T348188]: For PDL, download and stream the PDF if available
2 parents: 3b0f921 + 14e9cc3

File tree

2 files changed: +184 −44 lines

bull/pdl-queue/consumer.js

Lines changed: 164 additions & 40 deletions
@@ -7,6 +7,8 @@ const _ = require("lodash");
 const winston = require("winston");
 const logger = winston.loggers.get("defaultLogger");
 const { logUserData } = require("./../../utils/helper");
+const { customFetch } = require("../../utils/helper");
+const { Readable } = require("stream");
 
 var JSZip = require("jszip");
 PDLQueue.on("active", (job, jobPromise) => {
@@ -66,12 +68,58 @@ async function getZipAndBytelength(no_of_pages, id, title, job) {
   return [zip, byteLength, errorFlag];
 }
 
-function setHeaders(metadata, byteLength, title) {
+async function getPdfAndBytelength(pdfUrl, job) {
+  try {
+    let errorFlag = { status: false, page: "" };
+    const response = await customFetch(
+      pdfUrl,
+      "GET",
+      new Headers({
+        "Content-Type": "application/pdf",
+      }),
+      "file"
+    );
+    if (response.status === 200) {
+      job.progress(30);
+      const buffer = await response.buffer();
+      job.progress(60);
+      return {
+        pdfBuffer: buffer,
+        byteLength: buffer.byteLength,
+        errorFlag,
+      };
+    } else {
+      logger.log({
+        level: "error",
+        message: `Failure PDL: Failed to download PDF. Status Code: ${response.status}`,
+      });
+      errorFlag = { status: true, page: pdfUrl };
+      return {
+        pdfBuffer: null,
+        byteLength: null,
+        errorFlag,
+      };
+    }
+  } catch (error) {
+    logger.log({
+      level: "error",
+      message: `Failure PDL: ${error}`,
+    });
+    let errorFlag = { status: true, page: pdfUrl };
+    return {
+      pdfBuffer: null,
+      byteLength: null,
+      errorFlag,
+    };
+  }
+}
+
+function setHeaders(metadata, byteLength, title, contentType) {
   let headers = {};
   headers[
     "Authorization"
   ] = `LOW ${process.env.access_key}:${process.env.secret_key}`;
-  headers["Content-type"] = "application/zip";
+  headers["Content-type"] = `application/${contentType}`;
   headers["Content-length"] = byteLength;
   headers["X-Amz-Auto-Make-Bucket"] = 1;
   headers["X-Archive-meta-collection"] = "opensource";
@@ -89,14 +137,22 @@ function setHeaders(metadata, byteLength, title) {
     headers[`X-archive-meta-${meta_key}`] = metadata[key];
   }
   headers["X-archive-meta-title"] = metadata["title"];
+  headers[`X-archive-meta-description`] = `uri(${encodeURI(
+    metadata.description?.trim()
+  )})`;
   return headers;
 }
 
-async function uploadToIA(zip, metadata, byteLength, email, job) {
+async function uploadZipToIA(zip, metadata, byteLength, email, job, onError) {
   const bucketTitle = metadata.IAIdentifier;
   const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}_images.zip`;
   metadata = _.omit(metadata, "coverImage");
-  let headers = setHeaders(metadata, byteLength, metadata.title);
+  let headers = setHeaders(
+    metadata,
+    byteLength,
+    metadata.title,
+    job.data.details.contentType
+  );
   await zip.generateNodeStream({ type: "nodebuffer", streamFiles: true }).pipe(
     request(
       {
@@ -115,13 +171,13 @@ async function uploadToIA(zip, metadata, byteLength, email, job) {
           level: "error",
           message: `IA Failure PDL ${error}`,
         });
-        done(new Error(error));
+        onError(true, error);
       } else {
         logger.log({
           level: "error",
           message: `IA Failure PDL ${body}`,
         });
-        done(new Error(body));
+        onError(true, body);
       }
       //EmailProducer(email, metadata.title, trueURI, false);
     }
@@ -130,42 +186,110 @@ async function uploadToIA(zip, metadata, byteLength, email, job) {
   );
 }
 
-PDLQueue.process(async (job, done) => {
-  const jobLogs = job.data.details;
-  const trueURI = `http://archive.org/details/${job.data.details.IAIdentifier}`;
-  jobLogs["trueURI"] = trueURI;
-  jobLogs["userName"] = job.data.details.userName;
-  const [zip, byteLength, errorFlag] = await getZipAndBytelength(
-    job.data.details.Pages,
-    job.data.details.bookID,
-    job.data.details.title,
-    job
+async function uploadPdfToIA(
+  pdfBuffer,
+  metadata,
+  byteLength,
+  email,
+  job,
+  onError
+) {
+  const bucketTitle = metadata.IAIdentifier;
+  const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}.pdf`;
+  let headers = setHeaders(
+    metadata,
+    byteLength,
+    metadata.title,
+    job.data.details.contentType
   );
-  if (errorFlag.status) {
-    job.log(JSON.stringify(jobLogs));
-    logUserData(jobLogs["userName"], "Panjab Digital Library");
-    logger.log({
-      level: "error",
-      message: `Upload to Internet Archive failed because ${errorFlag.page} is not reachable. Please try again or contact Panjab Digital Library for more details.`,
-    });
-    job.progress(100);
-    done(
-      new Error(
-        `Upload to Internet Archive failed because <a href=${errorFlag.page} target='_blank'>${errorFlag.page}</a> is not reachable. Please try again or contact Panjab Digital Library for more details.`
-      )
-    );
-  } else {
+  const options = {
+    method: "PUT",
+    uri: IAuri,
+    headers: headers,
+  };
+  const readableStream = Readable.from(pdfBuffer);
+  readableStream.pipe(
+    request(options, (error, response, body) => {
+      if (response.statusCode === 200) {
+        // EmailProducer(email, metadata.title, IAuri, true);
+      } else {
+        logger.log({
+          level: "error",
+          message: `IA Failure PDL ${body || error}`,
+        });
+        onError(true, body || error);
+      }
+    })
+  );
+}
+
+PDLQueue.process(async (job, done) => {
+  try {
+    const jobLogs = job.data.details;
+    const trueURI = `http://archive.org/details/${job.data.details.IAIdentifier}`;
+    jobLogs["trueURI"] = trueURI;
+    jobLogs["userName"] = job.data.details.userName;
     job.log(JSON.stringify(jobLogs));
     logUserData(jobLogs["userName"], "Panjab Digital Library");
-    job.progress(90);
-    await uploadToIA(
-      zip,
-      job.data.details,
-      byteLength,
-      job.data.details.email,
-      job
-    );
-    job.progress(100);
-    done(null, true);
+
+    if (job.data.details.pdfUrl) {
+      const { pdfBuffer, byteLength, errorFlag } = await getPdfAndBytelength(
+        job.data.details.pdfUrl,
+        job
+      );
+      if (errorFlag.status) {
+        logger.log({
+          level: "error",
+          message: `Failure PDL: Failed to download ${errorFlag.page}`,
+        });
+        done(new Error(`Failure PDL: Failed to download ${errorFlag.page}`));
+      }
+      await uploadPdfToIA(
+        pdfBuffer,
+        job.data.details,
+        byteLength,
+        job.data.details.email,
+        job,
+        (isError, error) => {
+          if (isError) {
+            done(new Error(error));
+          }
+        }
+      );
+      job.progress(100);
+      done(null, true);
+    } else {
+      const [zip, byteLength, errorFlag] = await getZipAndBytelength(
+        job.data.details.Pages,
+        job.data.details.bookID,
+        job.data.details.title,
+        job
+      );
+      if (errorFlag.status) {
+        logger.log({
+          level: "error",
+          message: `Failure PDL: Failed to download ${errorFlag.page}`,
+        });
+        done(new Error(`Failure PDL: Failed to download ${errorFlag.page}`));
+      }
+      job.progress(90);
+      await uploadZipToIA(
+        zip,
+        job.data.details,
+        byteLength,
+        job.data.details.email,
+        job,
+        (isError, error) => {
+          if (isError) {
+            done(new Error(error));
+          }
+        }
+      );
+      job.progress(100);
+      done(null, true);
+    }
+  } catch (error) {
+    console.error("Error processing job:", error);
+    done(new Error(error));
   }
 });
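
For context, here is a minimal producer sketch that would enqueue a job taking the new PDF path. It is not part of this commit: the queue name "pdl-queue", the redis defaults, and all sample values are assumptions; the details fields mirror what PDLQueue.process reads above.

// Hypothetical producer (illustration only, not in this PR).
const Queue = require("bull");

// Assumption: the consumer registers its queue under this name.
const PDLQueue = new Queue("pdl-queue");

async function enqueuePdlJob() {
  await PDLQueue.add({
    details: {
      IAIdentifier: "pdl-sample-book", // IA bucket name (placeholder)
      title: "Sample Book",
      userName: "example-user",
      email: "user@example.org",
      // When the metadata scraper found a "Download PDF" link, these two
      // fields route the job through getPdfAndBytelength/uploadPdfToIA:
      contentType: "pdf",
      pdfUrl: "http://www.panjabdigilib.org/webuser/searches/sample.pdf",
      // Without pdfUrl the job falls back to the page-image path, which
      // also needs Pages and bookID for getZipAndBytelength.
    },
  });
}

enqueuePdlJob();

Note that the upload helpers now report failures through an onError callback instead of calling done directly, which keeps them independent of Bull's job-completion API.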

utils/helper.js

Lines changed: 20 additions & 4 deletions
@@ -25,7 +25,12 @@ module.exports = {
     return title.replace(/[ \(\)\[\],:]/g, "");
   },
 
-  customFetch: async (URI, method = "GET", headers = new Headers()) => {
+  customFetch: async (
+    URI,
+    method = "GET",
+    headers = new Headers(),
+    contentType = "other"
+  ) => {
     return fetch(URI, {
       method: method,
       headers: headers,
@@ -34,7 +39,10 @@ module.exports = {
       (res) => {
         if (res.status === 404) {
           return 404;
-        } else return res.json();
+        } else {
+          const result = contentType === "file" ? res : res.json();
+          return result;
+        }
       },
       (err) => {
         logger.log({
@@ -105,7 +113,10 @@ module.exports = {
     let PNdetails = {};
     const keys = $(".ubhypers");
     const values = $(".dhypers");
-
+    const downloadPdfLink = $("#downloadpdf a")[0]?.attribs.href;
+    let pagesLabel = $(".ubhypers:contains('Pages')");
+    let pagesValue = pagesLabel.parent().next().find(".dhypers").text();
+    let contentType = "zip";
     function addOtherMetaData(limit, keys, values, PNdetails) {
       let value;
       for (let i = 0; i < values.length; i++) {
@@ -173,8 +184,13 @@ module.exports = {
     src = src.match(/pdl.*/gm);
     PNdetails.coverImage = `http://panjabdigilib.org/${src}`;
 
+    if (downloadPdfLink?.length) {
+      contentType = "pdf";
+      PNdetails.pdfUrl = `http://www.panjabdigilib.org/webuser/searches/${downloadPdfLink}`;
+    }
+    PNdetails.contentType = contentType;
+    PNdetails.Pages = pagesValue;
     delete PNdetails[""];
-
     return PNdetails;
   },

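To show the extended customFetch in use, here is a hypothetical caller sketch. The URLs are placeholders, and Headers is assumed to come from node-fetch v2, consistent with the response.buffer() call in the consumer.

// Illustration only: the two call styles of the extended customFetch.
const { Headers } = require("node-fetch");
const { customFetch } = require("./utils/helper");

async function demo() {
  // Default contentType ("other"): resolves to the parsed JSON body, or 404.
  const meta = await customFetch("https://example.org/api/book.json");
  console.log(meta);

  // contentType "file": resolves to the raw Response, so the caller can
  // buffer or stream the body itself (as getPdfAndBytelength does).
  const res = await customFetch(
    "https://example.org/files/book.pdf",
    "GET",
    new Headers({ "Content-Type": "application/pdf" }),
    "file"
  );
  if (res !== 404 && res.status === 200) {
    const buf = await res.buffer(); // node-fetch v2 Response#buffer()
    console.log(`downloaded ${buf.byteLength} bytes`);
  }
}

demo();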