For PDL, download the PDF if available

okerekechinweotito · okerekechinweotito · commit 5efd9a9556db · 2023-10-20T10:03:57.000+01:00
diff --git a/bull/pdl-queue/consumer.js b/bull/pdl-queue/consumer.js
@@ -7,6 +7,8 @@ const _ = require("lodash");
 const winston = require("winston");
 const logger = winston.loggers.get("defaultLogger");
 const { logUserData } = require("./../../utils/helper");
+const { customFetch } = require("../../utils/helper");
+const { Readable } = require("stream");
 
 var JSZip = require("jszip");
 PDLQueue.on("active", (job, jobPromise) => {
@@ -57,12 +59,41 @@ async function getZipAndBytelength(no_of_pages, id, title, job) {
   return [zip, byteLength];
 }
 
-function setHeaders(metadata, byteLength, title) {
+/**
+ * Downloads the PDF at `pdfUrl` and reports progress on the Bull `job`.
+ * Resolves to `{ pdfBuffer, byteLength }`, or to `null` if the download fails.
+ */
+async function getPdfAndBytelength(pdfUrl, job) {
+  try {
+    const headers = new Headers({ "Content-Type": "application/pdf" });
+    const response = await customFetch(pdfUrl, "GET", headers, "pdf");
+    // customFetch resolves to the number 404 (not a Response) for missing
+    // resources, so take the status from whichever shape came back.
+    const status = typeof response === "number" ? response : response?.status;
+    if (status !== 200) {
+      throw new Error(`Failed to download PDF. Status Code: ${status} `);
+    }
+    job.progress(30);
+    // node-fetch v2 exposes buffer(); WHATWG Response only has arrayBuffer().
+    const buffer =
+      typeof response.buffer === "function"
+        ? await response.buffer()
+        : Buffer.from(await response.arrayBuffer());
+    job.progress(60);
+    return { pdfBuffer: buffer, byteLength: buffer.byteLength };
+  } catch (error) {
+    // Best-effort: log and signal failure with null; callers must check.
+    console.error("Error:", error);
+    return null;
+  }
+}
+
+function setHeaders(metadata, byteLength, title, contentType) {
   let headers = {};
   headers[
     "Authorization"
   ] = `LOW ${process.env.access_key}:${process.env.secret_key}`;
-  headers["Content-type"] = "application/zip";
+  headers["Content-type"] = `application/${contentType}`;
   headers["Content-length"] = byteLength;
   headers["X-Amz-Auto-Make-Bucket"] = 1;
   headers["X-Archive-meta-collection"] = "opensource";
@@ -83,11 +114,16 @@ function setHeaders(metadata, byteLength, title) {
   return headers;
 }
 
-async function uploadToIA(zip, metadata, byteLength, email, job) {
+async function uploadZipToIA(zip, metadata, byteLength, email, job) {
   const bucketTitle = metadata.IAIdentifier;
   const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}_images.zip`;
   metadata = _.omit(metadata, "coverImage");
-  let headers = setHeaders(metadata, byteLength, metadata.title);
+  let headers = setHeaders(
+    metadata,
+    byteLength,
+    metadata.title,
+    job.data.details.contentType
+  );
   await zip.generateNodeStream({ type: "nodebuffer", streamFiles: true }).pipe(
     request(
       {
@@ -112,27 +148,78 @@ async function uploadToIA(zip, metadata, byteLength, email, job) {
   );
 }
 
-PDLQueue.process(async (job, done) => {
-  const jobLogs = job.data.details;
-  const trueURI = `http://archive.org/details/${job.data.details.IAIdentifier}`;
-  jobLogs["trueURI"] = trueURI;
-  jobLogs["userName"] = job.data.details.userName;
-  job.log(JSON.stringify(jobLogs));
-  logUserData(jobLogs["userName"], "Panjab Digital Library");
-  const [zip, byteLength] = await getZipAndBytelength(
-    job.data.details.Pages,
-    job.data.details.bookID,
-    job.data.details.title,
-    job
-  );
-  job.progress(90);
-  await uploadToIA(
-    zip,
-    job.data.details,
+// Streams `pdfBuffer` to `<IAIdentifier>.pdf` in the IA bucket via an S3 PUT.
+// Resolves true on HTTP 200, false otherwise; rejects on transport errors.
+async function uploadPdfToIA(pdfBuffer, metadata, byteLength, email, job) {
+  const bucketTitle = metadata.IAIdentifier;
+  const IAuri = `http://s3.us.archive.org/${bucketTitle}/${bucketTitle}.pdf`;
+  const headers = setHeaders(
+    metadata,
     byteLength,
-    job.data.details.email,
-    job
+    metadata.title,
+    job.data.details.contentType
   );
-  job.progress(100);
-  done(null, true);
+  return new Promise((resolve, reject) => {
+    const upload = request(
+      { method: "PUT", uri: IAuri, headers: headers },
+      (error, response, body) => {
+        // On transport errors `response` is undefined; reject, don't crash.
+        if (error) return reject(error);
+        if (response.statusCode === 200) {
+          // EmailProducer(email, metadata.title, IAuri, true);
+        } else {
+          logger.log({ level: "error", message: `IA Failure PDL ${body}` });
+        }
+        resolve(response.statusCode === 200);
+      }
+    );
+    Readable.from(pdfBuffer).pipe(upload);
+  });
+}
+
+// Bull processor for PDL jobs. Uploads the book's PDF to the Internet Archive
+// when the job details include a pdfUrl; otherwise uploads a zip built by
+// getZipAndBytelength. Completion or failure is reported through done().
+PDLQueue.process(async (job, done) => {
+  try {
+    const details = job.data.details;
+    // trueURI is stored on the job details so it is part of the logged JSON.
+    details["trueURI"] = `http://archive.org/details/${details.IAIdentifier}`;
+    job.log(JSON.stringify(details));
+    logUserData(details["userName"], "Panjab Digital Library");
+
+    if (details.pdfUrl) {
+      const pdf = await getPdfAndBytelength(details.pdfUrl, job);
+      // getPdfAndBytelength can resolve to null on a failed download; fail the
+      // job with a clear message instead of a destructuring TypeError.
+      if (!pdf) {
+        throw new Error(`Failed to download PDF from ${details.pdfUrl}`);
+      }
+      await uploadPdfToIA(
+        pdf.pdfBuffer,
+        details,
+        pdf.byteLength,
+        details.email,
+        job
+      );
+    } else {
+      const [zip, byteLength] = await getZipAndBytelength(
+        details.Pages,
+        details.bookID,
+        details.title,
+        job
+      );
+      job.progress(90);
+      await uploadZipToIA(zip, details, byteLength, details.email, job);
+    }
+
+    // Both branches finish the same way once their upload call returns.
+    job.progress(100);
+    done(null, true);
+  } catch (error) {
+    console.error("Error processing job:", error);
+    // new Error(error) would stringify the original and drop its stack;
+    // hand Error instances to Bull unchanged and wrap anything else.
+    done(error instanceof Error ? error : new Error(String(error)));
+  }
 });
diff --git a/utils/helper.js b/utils/helper.js
@@ -25,7 +25,12 @@ module.exports = {
     return title.replace(/[ \(\)\[\],:]/g, "");
   },
 
-  customFetch: async (URI, method = "GET", headers = new Headers()) => {
+  customFetch: async (
+    URI,
+    method = "GET",
+    headers = new Headers(),
+    contentType = "other"
+  ) => {
     return fetch(URI, {
       method: method,
       headers: headers,
@@ -34,7 +39,10 @@ module.exports = {
         (res) => {
           if (res.status === 404) {
             return 404;
-          } else return res.json();
+          } else {
+            const result = contentType === "pdf" ? res : res.json();
+            return result;
+          }
         },
         (err) => {
           logger.log({
@@ -105,7 +113,8 @@ module.exports = {
     let PNdetails = {};
     const keys = $(".ubhypers");
     const values = $(".dhypers");
-
+    const downloadPdfLink = $("#downloadpdf a")[0]?.attribs.href;
+    let contentType = "zip";
     function addOtherMetaData(limit, keys, values, PNdetails) {
       let value;
       for (let i = 0; i < values.length; i++) {
@@ -173,6 +182,11 @@ module.exports = {
     src = src.match(/pdl.*/gm);
     PNdetails.coverImage = `http://panjabdigilib.org/${src}`;
 
+    if (downloadPdfLink?.length) {
+      contentType = "pdf";
+      PNdetails.pdfUrl = `http://www.panjabdigilib.org/webuser/searches/${downloadPdfLink}`;
+    }
+    PNdetails.contentType = contentType;
     delete PNdetails[""];
 
     return PNdetails;