Skip to content

Commit e77024c

Browse files
for PDL, Download PDF if available
1 parent 008cb3e commit e77024c

File tree

2 files changed

+126
-40
lines changed

2 files changed

+126
-40
lines changed

bull/pdl-queue/consumer.js

Lines changed: 109 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ const _ = require("lodash");
77
const winston = require("winston");
88
const logger = winston.loggers.get("defaultLogger");
99
const { logUserData } = require("./../../utils/helper");
10+
const { customFetch } = require("../../utils/helper");
11+
const { Readable } = require("stream");
1012

1113
var JSZip = require("jszip");
1214
PDLQueue.on("active", (job, jobPromise) => {
@@ -66,7 +68,36 @@ async function getZipAndBytelength(no_of_pages, id, title, job) {
6668
return [zip, byteLength, errorFlag];
6769
}
6870

69-
function setHeaders(metadata, byteLength, title) {
71+
/**
 * Download a PDF from the given URL and report its size.
 *
 * @param {string} pdfUrl - Direct URL of the PDF to download.
 * @param {object} job - Bull job; progress is advanced to 30 once the
 *   response arrives and 60 once the body is buffered.
 * @returns {Promise<{pdfBuffer: Buffer, byteLength: number}>} The PDF bytes
 *   and their length (used later as the upload Content-Length).
 * @throws {Error} When the download returns a non-200 status or the fetch
 *   itself fails.
 */
async function getPdfAndBytelength(pdfUrl, job) {
  const response = await customFetch(
    pdfUrl,
    "GET",
    new Headers({
      "Content-Type": "application/pdf",
    }),
    "pdf"
  );
  // NOTE(review): customFetch returns the bare number 404 on a 404, so
  // `response.status` is undefined in that case and this branch throws —
  // which is the desired outcome for a missing PDF.
  if (response.status !== 200) {
    // Throw instead of returning null: the caller destructures the result,
    // so a null return would surface as an unrelated TypeError and mask the
    // real failure. Throwing lets the queue's try/catch report it properly.
    throw new Error(
      `Failed to download PDF. Status Code: ${response.status} `
    );
  }
  job.progress(30);
  const buffer = await response.buffer();
  job.progress(60);
  return {
    pdfBuffer: buffer,
    byteLength: buffer.byteLength,
  };
}
99+
100+
function setHeaders(metadata, byteLength, title, contentType) {
70101
let headers = {};
71102
headers[
72103
"Authorization"
@@ -92,11 +123,16 @@ function setHeaders(metadata, byteLength, title) {
92123
return headers;
93124
}
94125

95-
async function uploadToIA(zip, metadata, byteLength, email, job) {
126+
async function uploadZipToIA(zip, metadata, byteLength, email, job) {
96127
const bucketTitle = metadata.IAIdentifier;
97128
const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}_images.zip`;
98129
metadata = _.omit(metadata, "coverImage");
99-
let headers = setHeaders(metadata, byteLength, metadata.title);
130+
let headers = setHeaders(
131+
metadata,
132+
byteLength,
133+
metadata.title,
134+
job.data.details.contentType
135+
);
100136
await zip.generateNodeStream({ type: "nodebuffer", streamFiles: true }).pipe(
101137
request(
102138
{
@@ -121,42 +157,78 @@ async function uploadToIA(zip, metadata, byteLength, email, job) {
121157
);
122158
}
123159

124-
PDLQueue.process(async (job, done) => {
125-
const jobLogs = job.data.details;
126-
const trueURI = `http://archive.org/details/${job.data.details.IAIdentifier}`;
127-
jobLogs["trueURI"] = trueURI;
128-
jobLogs["userName"] = job.data.details.userName;
129-
const [zip, byteLength, errorFlag] = await getZipAndBytelength(
130-
job.data.details.Pages,
131-
job.data.details.bookID,
132-
job.data.details.title,
133-
job
160+
/**
 * Upload a PDF buffer to the Internet Archive S3 endpoint.
 *
 * Returns a Promise that settles only when the PUT request completes, so a
 * caller that `await`s this function no longer marks the job finished while
 * the upload is still in flight (the previous version piped the stream and
 * returned immediately).
 *
 * @param {Buffer} pdfBuffer - Raw PDF bytes to upload.
 * @param {object} metadata - Item metadata; `IAIdentifier` names the bucket.
 * @param {number} byteLength - PDF size in bytes, forwarded to setHeaders.
 * @param {string} email - Uploader's email (reserved for the disabled
 *   EmailProducer notification; currently unused).
 * @param {object} job - Bull job, used to derive the content-type header.
 * @returns {Promise<void>} Resolves on a 200 response, rejects on a request
 *   error or any other status code.
 */
function uploadPdfToIA(pdfBuffer, metadata, byteLength, email, job) {
  const bucketTitle = metadata.IAIdentifier;
  const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}.pdf`;
  const headers = setHeaders(
    metadata,
    byteLength,
    metadata.title,
    job.data.details.contentType
  );
  const options = {
    method: "PUT",
    uri: IAuri,
    headers: headers,
  };
  return new Promise((resolve, reject) => {
    Readable.from(pdfBuffer).pipe(
      request(options, (error, response, body) => {
        // Check the transport error first: when the request itself fails,
        // `response` is undefined and reading `response.statusCode` would
        // throw a TypeError inside the callback.
        if (error) {
          logger.log({
            level: "error",
            message: `IA Failure PDL ${error.message}`,
          });
          reject(error);
          return;
        }
        if (response.statusCode === 200) {
          // EmailProducer(email, metadata.title, IAuri, true);
          resolve();
        } else {
          logger.log({
            level: "error",
            message: `IA Failure PDL ${body}`,
          });
          reject(
            new Error(`IA upload failed with status ${response.statusCode}`)
          );
        }
      })
    );
  });
}
188+
189+
/**
 * Bull consumer for PDL jobs: when the item exposes a ready-made PDF
 * (job.data.details.pdfUrl) download and upload it directly; otherwise
 * assemble the page images into a zip and upload that. Either way the
 * result lands in the item's Internet Archive bucket.
 */
PDLQueue.process(async (job, done) => {
  try {
    const jobLogs = job.data.details;
    const trueURI = `http://archive.org/details/${job.data.details.IAIdentifier}`;
    jobLogs["trueURI"] = trueURI;
    jobLogs["userName"] = job.data.details.userName;
    job.log(JSON.stringify(jobLogs));
    logUserData(jobLogs["userName"], "Panjab Digital Library");

    if (job.data.details.pdfUrl) {
      // PDF path: fetch the provided PDF and upload it as-is.
      const { pdfBuffer, byteLength } = await getPdfAndBytelength(
        job.data.details.pdfUrl,
        job
      );
      await uploadPdfToIA(
        pdfBuffer,
        job.data.details,
        byteLength,
        job.data.details.email,
        job
      );
    } else {
      // Zip path: fetch every page image and bundle them into a zip.
      const [zip, byteLength, errorFlag] = await getZipAndBytelength(
        job.data.details.Pages,
        job.data.details.bookID,
        job.data.details.title,
        job
      );
      // Restore the unreachable-page check (getZipAndBytelength returns a
      // third errorFlag element): without it a book with a failed page is
      // silently uploaded as an incomplete zip.
      if (errorFlag && errorFlag.status) {
        logger.log({
          level: "error",
          message: `Upload to Internet Archive failed because ${errorFlag.page} is not reachable. Please try again or contact Panjab Digital Library for more details.`,
        });
        job.progress(100);
        done(
          new Error(
            `Upload to Internet Archive failed because <a href=${errorFlag.page} target='_blank'>${errorFlag.page}</a> is not reachable. Please try again or contact Panjab Digital Library for more details.`
          )
        );
        return;
      }
      job.progress(90);
      await uploadZipToIA(
        zip,
        job.data.details,
        byteLength,
        job.data.details.email,
        job
      );
    }
    job.progress(100);
    done(null, true);
  } catch (error) {
    console.error("Error processing job:", error);
    // Pass the original Error through so its message and stack survive;
    // `new Error(error)` would coerce an Error to a string and lose both.
    done(error instanceof Error ? error : new Error(String(error)));
  }
});

utils/helper.js

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,12 @@ module.exports = {
2525
return title.replace(/[ \(\)\[\],:]/g, "");
2626
},
2727

28-
customFetch: async (URI, method = "GET", headers = new Headers()) => {
28+
customFetch: async (
29+
URI,
30+
method = "GET",
31+
headers = new Headers(),
32+
contentType = "other"
33+
) => {
2934
return fetch(URI, {
3035
method: method,
3136
headers: headers,
@@ -34,7 +39,10 @@ module.exports = {
3439
(res) => {
3540
if (res.status === 404) {
3641
return 404;
37-
} else return res.json();
42+
} else {
43+
const result = contentType === "pdf" ? res : res.json();
44+
return result;
45+
}
3846
},
3947
(err) => {
4048
logger.log({
@@ -105,7 +113,8 @@ module.exports = {
105113
let PNdetails = {};
106114
const keys = $(".ubhypers");
107115
const values = $(".dhypers");
108-
116+
const downloadPdfLink = $("#downloadpdf a")[0]?.attribs.href;
117+
let contentType = "zip";
109118
function addOtherMetaData(limit, keys, values, PNdetails) {
110119
let value;
111120
for (let i = 0; i < values.length; i++) {
@@ -173,6 +182,11 @@ module.exports = {
173182
src = src.match(/pdl.*/gm);
174183
PNdetails.coverImage = `http://panjabdigilib.org/${src}`;
175184

185+
if (downloadPdfLink?.length) {
186+
contentType = "pdf";
187+
PNdetails.pdfUrl = `http://www.panjabdigilib.org/webuser/searches/${downloadPdfLink}`;
188+
}
189+
PNdetails.contentType = contentType;
176190
delete PNdetails[""];
177191

178192
return PNdetails;

0 commit comments

Comments
 (0)