From 736a023262dbcef625d5da26560ad1399d69545c Mon Sep 17 00:00:00 2001 From: "l.kaesberg" Date: Thu, 28 Mar 2024 11:53:10 +0100 Subject: [PATCH 1/4] feat: backend can process files with grobid --- backend/app.js | 3 + backend/package-lock.json | 168 +++++++++++++++++++++++++++++++++++++- backend/package.json | 5 +- backend/routes/extract.js | 52 ++++++++++++ docker-compose.dev.yml | 2 +- docker-compose.yml | 2 +- 6 files changed, 228 insertions(+), 4 deletions(-) create mode 100644 backend/routes/extract.js diff --git a/backend/app.js b/backend/app.js index ad39705..9cbdb9f 100644 --- a/backend/app.js +++ b/backend/app.js @@ -8,6 +8,7 @@ const indexRouter = require('./routes/index'); const usersRouter = require('./routes/users'); const testAPIRouter = require("./routes/testAPI"); const databaseRouter = require("./routes/database"); +const extractRouter = require("./routes/extract"); const bodyParser = require('body-parser'); const config = require("./config.json"); @@ -40,6 +41,8 @@ app.use('/', indexRouter); app.use('/users', usersRouter); app.use("/testAPI", testAPIRouter); app.use("/database", databaseRouter); +app.use("/extract", extractRouter); + // catch 404 and forward to error handler app.use(function (req, res, next) { diff --git a/backend/package-lock.json b/backend/package-lock.json index 36f9f0b..0081545 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -8,18 +8,21 @@ "name": "backend", "version": "0.0.0", "dependencies": { + "axios": "^1.6.8", "body-parser": "^1.20.2", "cookie-parser": "~1.4.4", "cors": "^2.8.5", "debug": "~2.6.9", "express": "~4.18.2", + "form-data": "^4.0.0", "http-errors": "~1.6.3", "jade": "^1.9.2", "morgan": "~1.10.0", "multer": "^1.4.5-lts.1", "pg": "^8.8.0", "wink-eng-lite-web-model": "^1.5.2", - "wink-nlp": "^1.14.3" + "wink-nlp": "^1.14.3", + "xml2js": "^0.6.2" }, "devDependencies": { "nodemon": "^3.0.2" @@ -93,6 +96,21 @@ "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==" }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + }, + "node_modules/axios": { + "version": "1.6.8", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz", + "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==", + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -293,6 +311,17 @@ "fsevents": "~2.3.2" } }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, "node_modules/commander": { "version": "2.6.0", "resolved": "https://registry.npmjs.org/commander/-/commander-2.6.0.tgz", @@ -450,6 +479,14 @@ "node": ">= 0.4" } }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/depd": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", @@ -698,6 +735,38 @@ "node": ">= 0.8" } }, + "node_modules/follow-redirects": { + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -1356,6 +1425,11 @@ "node": ">= 0.10" } }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + }, "node_modules/pstree.remy": { "version": "1.1.8", "resolved": "https://registry.npmjs.org/pstree.remy/-/pstree.remy-1.1.8.tgz", @@ -1475,6 +1549,11 @@ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, + "node_modules/sax": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz", + "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==" + }, "node_modules/semver": { "version": "7.5.4", "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", @@ -1858,6 +1937,26 @@ "node": ">=0.4.0" } }, + "node_modules/xml2js": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.6.2.tgz", + "integrity": "sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==", + "dependencies": { + "sax": ">=0.6.0", + "xmlbuilder": "~11.0.0" + }, + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/xmlbuilder": { + "version": "11.0.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", + "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==", + "engines": { + "node": ">=4.0" + } + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", @@ -1927,6 +2026,21 @@ "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==" }, + "asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + }, + "axios": { + "version": "1.6.8", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz", + "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==", + "requires": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, "balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -2081,6 +2195,14 @@ "readdirp": "~3.6.0" } }, + "combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "requires": { + "delayed-stream": "~1.0.0" + } + }, "commander": { "version": "2.6.0", "resolved": "https://registry.npmjs.org/commander/-/commander-2.6.0.tgz", @@ -2201,6 +2323,11 @@ "has-property-descriptors": "^1.0.0" } }, + "delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==" + }, "depd": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", @@ -2389,6 +2516,21 @@ } } }, + "follow-redirects": { + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==" + }, + "form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "requires": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + } + }, "forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -2856,6 +2998,11 @@ "ipaddr.js": "1.9.1" } }, + "proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + }, "pstree.remy": { "version": "1.1.8", "resolved": "https://registry.npmjs.org/pstree.remy/-/pstree.remy-1.1.8.tgz", @@ -2953,6 +3100,11 @@ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, + "sax": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz", + "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==" + }, "semver": { "version": "7.5.4", "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", @@ -3249,6 +3401,20 @@ "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.2.tgz", "integrity": "sha512-xSBsCeh+g+dinoBv3GAOWM4LcVVO68wLXRanibtBSdUvkGWQRGeE9P7IwU9EmDDi4jA6L44lz15CGMwdw9N5+Q==" }, + "xml2js": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.6.2.tgz", + "integrity": "sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==", + "requires": { + "sax": ">=0.6.0", + "xmlbuilder": "~11.0.0" + } + }, + "xmlbuilder": { + "version": "11.0.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", + "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==" + }, "xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", diff --git a/backend/package.json b/backend/package.json index bdd8f36..35ac8c9 100644 --- a/backend/package.json +++ b/backend/package.json @@ -8,18 +8,21 @@ "dev-docker": "nodemon --legacy-watch ./bin/www" }, "dependencies": { + "axios": "^1.6.8", "body-parser": "^1.20.2", "cookie-parser": "~1.4.4", "cors": "^2.8.5", "debug": "~2.6.9", "express": "~4.18.2", + "form-data": "^4.0.0", "http-errors": "~1.6.3", "jade": "^1.9.2", "morgan": "~1.10.0", "multer": "^1.4.5-lts.1", "pg": "^8.8.0", "wink-eng-lite-web-model": "^1.5.2", - "wink-nlp": "^1.14.3" + "wink-nlp": "^1.14.3", + "xml2js": "^0.6.2" }, "devDependencies": { "nodemon": "^3.0.2" diff --git a/backend/routes/extract.js b/backend/routes/extract.js new file mode 100644 index 0000000..74467f9 --- /dev/null +++ b/backend/routes/extract.js @@ -0,0 +1,52 @@ +const express = require('express'); +const axios = require('axios'); +const FormData = require('form-data'); +const multer = require('multer'); // Import multer +const xml2js = require('xml2js'); +const router = express.Router(); + +// Set up multer to parse form-data requests. This configuration doesn't store files to disk. +const upload = multer({ storage: multer.memoryStorage() }); + +// Change from router.get to router.post and use the upload middleware to handle the file upload. +router.post("/", upload.single('file'), async (req, res) => { + // Ensure a file is actually provided + if (!req.file) { + return res.status(400).send('No file uploaded.'); + } + + // Create a new FormData instance and append the uploaded file + const formData = new FormData(); + formData.append('input', req.file.buffer, { + filename: req.file.originalname, + contentType: req.file.mimetype, + knownLength: req.file.size, + }); + + try { + // Adjust the URL if necessary for your GROBID server + const grobidResponse = await axios.post('http://grobid:8070/api/processFulltextDocument', formData, { + headers: { + ...formData.getHeaders(), + }, + responseType: 'text', // Changed to 'text' to handle XML response correctly + }); + + // Convert XML response to JSON + xml2js.parseString(grobidResponse.data, (err, result) => { + if (err) { + console.error('Error parsing XML: ', err); + return res.status(500).send('Error parsing XML response'); + } + + // `result` is a JavaScript object. Send it back as JSON. + res.json(result); + }); + + } catch (error) { + console.error('Error when calling GROBID: ', error); + res.status(500).send('Error when processing the PDF file'); + } +}); + +module.exports = router; \ No newline at end of file diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index f825894..057a287 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -42,7 +42,7 @@ services: - backend grobid: - image: lfoppiano/grobid:0.7.0 + image: lfoppiano/grobid:0.8.0 ports: - "8070:8070" - "8071:8071" diff --git a/docker-compose.yml b/docker-compose.yml index 785a3a1..658d268 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -36,7 +36,7 @@ services: - backend grobid: - image: lfoppiano/grobid:0.7.0 + image: lfoppiano/grobid:0.8.0 ports: - "8070:8070" - "8071:8071" From a565f7dbe903a68b2ed8a9ec8fa54c7f03d4e9ae Mon Sep 17 00:00:00 2001 From: "l.kaesberg" Date: Thu, 28 Mar 2024 11:53:10 +0100 Subject: [PATCH 2/4] feat: backend can process files with grobid --- backend/app.js | 3 + backend/package-lock.json | 168 +++++++++++++++++++++++++++++++++++++- backend/package.json | 5 +- backend/routes/extract.js | 47 +++++++++++ docker-compose.dev.yml | 2 +- docker-compose.yml | 2 +- 6 files changed, 223 insertions(+), 4 deletions(-) create mode 100644 backend/routes/extract.js diff --git a/backend/app.js b/backend/app.js index ad39705..9cbdb9f 100644 --- a/backend/app.js +++ b/backend/app.js @@ -8,6 +8,7 @@ const indexRouter = require('./routes/index'); const usersRouter = require('./routes/users'); const testAPIRouter = require("./routes/testAPI"); const databaseRouter = require("./routes/database"); +const extractRouter = require("./routes/extract"); const bodyParser = require('body-parser'); const config = require("./config.json"); @@ -40,6 +41,8 @@ app.use('/', indexRouter); app.use('/users', usersRouter); app.use("/testAPI", testAPIRouter); app.use("/database", databaseRouter); +app.use("/extract", extractRouter); + // catch 404 and forward to error handler app.use(function (req, res, next) { diff --git a/backend/package-lock.json b/backend/package-lock.json index 36f9f0b..0081545 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -8,18 +8,21 @@ "name": "backend", "version": "0.0.0", "dependencies": { + "axios": "^1.6.8", "body-parser": "^1.20.2", "cookie-parser": "~1.4.4", "cors": "^2.8.5", "debug": "~2.6.9", "express": "~4.18.2", + "form-data": "^4.0.0", "http-errors": "~1.6.3", "jade": "^1.9.2", "morgan": "~1.10.0", "multer": "^1.4.5-lts.1", "pg": "^8.8.0", "wink-eng-lite-web-model": "^1.5.2", - "wink-nlp": "^1.14.3" + "wink-nlp": "^1.14.3", + "xml2js": "^0.6.2" }, "devDependencies": { "nodemon": "^3.0.2" @@ -93,6 +96,21 @@ "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==" }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + }, + "node_modules/axios": { + "version": "1.6.8", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz", + "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==", + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, "node_modules/balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -293,6 +311,17 @@ "fsevents": "~2.3.2" } }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, "node_modules/commander": { "version": "2.6.0", "resolved": "https://registry.npmjs.org/commander/-/commander-2.6.0.tgz", @@ -450,6 +479,14 @@ "node": ">= 0.4" } }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/depd": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", @@ -698,6 +735,38 @@ "node": ">= 0.8" } }, + "node_modules/follow-redirects": { + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -1356,6 +1425,11 @@ "node": ">= 0.10" } }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + }, "node_modules/pstree.remy": { "version": "1.1.8", "resolved": "https://registry.npmjs.org/pstree.remy/-/pstree.remy-1.1.8.tgz", @@ -1475,6 +1549,11 @@ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, + "node_modules/sax": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz", + "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==" + }, "node_modules/semver": { "version": "7.5.4", "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", @@ -1858,6 +1937,26 @@ "node": ">=0.4.0" } }, + "node_modules/xml2js": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.6.2.tgz", + "integrity": "sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==", + "dependencies": { + "sax": ">=0.6.0", + "xmlbuilder": "~11.0.0" + }, + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/xmlbuilder": { + "version": "11.0.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", + "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==", + "engines": { + "node": ">=4.0" + } + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", @@ -1927,6 +2026,21 @@ "resolved": "https://registry.npmjs.org/array-flatten/-/array-flatten-1.1.1.tgz", "integrity": "sha512-PCVAQswWemu6UdxsDFFX/+gVeYqKAod3D3UVm91jHwynguOwAvYPhx8nNlM++NqRcK6CxxpUafjmhIdKiHibqg==" }, + "asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + }, + "axios": { + "version": "1.6.8", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.6.8.tgz", + "integrity": "sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==", + "requires": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, "balanced-match": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", @@ -2081,6 +2195,14 @@ "readdirp": "~3.6.0" } }, + "combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "requires": { + "delayed-stream": "~1.0.0" + } + }, "commander": { "version": "2.6.0", "resolved": "https://registry.npmjs.org/commander/-/commander-2.6.0.tgz", @@ -2201,6 +2323,11 @@ "has-property-descriptors": "^1.0.0" } }, + "delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==" + }, "depd": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", @@ -2389,6 +2516,21 @@ } } }, + "follow-redirects": { + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==" + }, + "form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "requires": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + } + }, "forwarded": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", @@ -2856,6 +2998,11 @@ "ipaddr.js": "1.9.1" } }, + "proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + }, "pstree.remy": { "version": "1.1.8", "resolved": "https://registry.npmjs.org/pstree.remy/-/pstree.remy-1.1.8.tgz", @@ -2953,6 +3100,11 @@ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" }, + "sax": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.3.0.tgz", + "integrity": "sha512-0s+oAmw9zLl1V1cS9BtZN7JAd0cW5e0QH4W3LWEK6a4LaLEA2OTpGYWDY+6XasBLtz6wkm3u1xRw95mRuJ59WA==" + }, "semver": { "version": "7.5.4", "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.4.tgz", @@ -3249,6 +3401,20 @@ "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.2.tgz", "integrity": "sha512-xSBsCeh+g+dinoBv3GAOWM4LcVVO68wLXRanibtBSdUvkGWQRGeE9P7IwU9EmDDi4jA6L44lz15CGMwdw9N5+Q==" }, + "xml2js": { + "version": "0.6.2", + "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.6.2.tgz", + "integrity": "sha512-T4rieHaC1EXcES0Kxxj4JWgaUQHDk+qwHcYOCFHfiwKz7tOVPLq7Hjq9dM1WCMhylqMEfP7hMcOIChvotiZegA==", + "requires": { + "sax": ">=0.6.0", + "xmlbuilder": "~11.0.0" + } + }, + "xmlbuilder": { + "version": "11.0.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", + "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==" + }, "xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", diff --git a/backend/package.json b/backend/package.json index bdd8f36..35ac8c9 100644 --- a/backend/package.json +++ b/backend/package.json @@ -8,18 +8,21 @@ "dev-docker": "nodemon --legacy-watch ./bin/www" }, "dependencies": { + "axios": "^1.6.8", "body-parser": "^1.20.2", "cookie-parser": "~1.4.4", "cors": "^2.8.5", "debug": "~2.6.9", "express": "~4.18.2", + "form-data": "^4.0.0", "http-errors": "~1.6.3", "jade": "^1.9.2", "morgan": "~1.10.0", "multer": "^1.4.5-lts.1", "pg": "^8.8.0", "wink-eng-lite-web-model": "^1.5.2", - "wink-nlp": "^1.14.3" + "wink-nlp": "^1.14.3", + "xml2js": "^0.6.2" }, "devDependencies": { "nodemon": "^3.0.2" diff --git a/backend/routes/extract.js b/backend/routes/extract.js new file mode 100644 index 0000000..34627a5 --- /dev/null +++ b/backend/routes/extract.js @@ -0,0 +1,47 @@ +const express = require('express'); +const axios = require('axios'); +const FormData = require('form-data'); +const multer = require('multer'); // Import multer +const xml2js = require('xml2js'); +const router = express.Router(); + +const upload = multer({ storage: multer.memoryStorage() }); + +router.post("/", upload.single('file'), async (req, res) => { + // Ensure a file is actually provided + if (!req.file) { + return res.status(400).send('No file uploaded.'); + } + + const formData = new FormData(); + formData.append('input', req.file.buffer, { + filename: req.file.originalname, + contentType: req.file.mimetype, + knownLength: req.file.size, + }); + + try { + const grobidResponse = await axios.post('http://grobid:8070/api/processFulltextDocument', formData, { + headers: { + ...formData.getHeaders(), + }, + responseType: 'text', // Changed to 'text' to handle XML response correctly + }); + + // Convert XML response to JSON + xml2js.parseString(grobidResponse.data, (err, result) => { + if (err) { + console.error('Error parsing XML: ', err); + return res.status(500).send('Error parsing XML response'); + } + + res.json(result); + }); + + } catch (error) { + console.error('Error when calling GROBID: ', error); + res.status(500).send('Error when processing the PDF file'); + } +}); + +module.exports = router; \ No newline at end of file diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index f825894..057a287 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -42,7 +42,7 @@ services: - backend grobid: - image: lfoppiano/grobid:0.7.0 + image: lfoppiano/grobid:0.8.0 ports: - "8070:8070" - "8071:8071" diff --git a/docker-compose.yml b/docker-compose.yml index 785a3a1..658d268 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -36,7 +36,7 @@ services: - backend grobid: - image: lfoppiano/grobid:0.7.0 + image: lfoppiano/grobid:0.8.0 ports: - "8070:8070" - "8071:8071" From d31742aa3fcef456dacb06b02ca0fa96b86e9e23 Mon Sep 17 00:00:00 2001 From: "l.kaesberg" Date: Thu, 28 Mar 2024 16:05:09 +0100 Subject: [PATCH 3/4] feat: import grobid data to frontend --- frontend/src/EnhancedPreprintGenerator.tsx | 23 +++++--- frontend/src/pdf/PDFParser.ts | 66 +++++++++++++++++++--- 2 files changed, 72 insertions(+), 17 deletions(-) diff --git a/frontend/src/EnhancedPreprintGenerator.tsx b/frontend/src/EnhancedPreprintGenerator.tsx index a8f830a..502069d 100644 --- a/frontend/src/EnhancedPreprintGenerator.tsx +++ b/frontend/src/EnhancedPreprintGenerator.tsx @@ -12,8 +12,9 @@ import {v4 as uuidv4} from 'uuid'; import config from "./config.json" import darkTheme from "./theme"; import {downloadLatexFiles} from "./latex/GenerateLatexFiles"; +import {CircularProgress} from "@mui/material"; -const backendURL = process.env.REACT_APP_BACKEND_URL || config.backend_url; +const backendURL = process.env.REACT_APP_BACKEND_URL || config.backend_url; const PDFJS = window.pdfjsLib; @@ -23,6 +24,7 @@ interface AppProps { interface AppState { apiConnected?: boolean; file?: PDFFile; + loading: boolean; } @@ -77,7 +79,7 @@ export async function requestPreprints(title: string, keywords: string[]) { class EnhancedPreprintGenerator extends Component { constructor(props: AppProps) { super(props); - this.state = {apiConnected: false}; + this.state = {apiConnected: false, loading: false}; } callAPI() { @@ -138,23 +140,28 @@ class EnhancedPreprintGenerator extends Component { }} />
- {!this.state.file && - { + {(!this.state.file && !this.state.loading) && + { + this.setState({loading: true}) let base64File = await toBase64(file) let pdfDoc = await PDFDocument.load(base64File) const pdfText = await getPDFText(base64File); + const info = await parsePDF(file, pdfDoc, pdfText, file.name) let pdfFile: PDFFile = { name: file.name, file: pdfDoc, - info: parsePDF(pdfDoc, pdfText, file.name) + info: info } this.setState({ - file: pdfFile + file: pdfFile, + loading: false }) - }}/> } - {this.state.file && + {this.state.loading && + + } + {(this.state.file && !this.state.loading) && this.OnGeneration(bibTexEntries, keywords, similarPreprints)} onSubmitLatex={(bibTexEntries, keywords, similarPreprints) => this.OnGeneration(bibTexEntries, keywords, similarPreprints, true)}/> diff --git a/frontend/src/pdf/PDFParser.ts b/frontend/src/pdf/PDFParser.ts index 8e03c04..b1b7158 100644 --- a/frontend/src/pdf/PDFParser.ts +++ b/frontend/src/pdf/PDFParser.ts @@ -1,5 +1,8 @@ import {PDFDocument} from "pdf-lib"; import {extractKeywords} from "../languageProcessing/ExtractKeywords"; +import config from "../config.json"; + +const backendURL = process.env.REACT_APP_BACKEND_URL || config.backend_url; export interface PDFFile { name: string; @@ -22,26 +25,71 @@ export interface PDFInfo { keywords: string[] } -export function parsePDF(file: PDFDocument, text: { firstPage: any; text: string }, name: string): PDFInfo { +export async function parsePDF(file: File, pdf: PDFDocument, text: { + firstPage: any; + text: string +}, name: string): Promise { + const grobidData = await analyzeWithBackend(file) + const maxHeight = Math.max(...text.firstPage.items.map((item: { height: number }) => { return item.height })) let artType = "article" - let author = file.getAuthor() - let title = text.firstPage.items.filter((item: { height: number }) => { + let author = grobidData?.authors || pdf.getAuthor() + let title = grobidData?.title || text.firstPage.items.filter((item: { height: number }) => { return item.height === maxHeight }).map((item: any) => { return item.str - }).join("").trim().replace(/\s+/g, " ") || file.getTitle() || name.substring(0, name.length - 4).replace(/\s+/g, " ") - let pages = file.getPageCount() - let date = file.getCreationDate() || new Date() + }).join("").trim().replace(/\s+/g, " ") || pdf.getTitle() || name.substring(0, name.length - 4).replace(/\s+/g, " ") + let pages = pdf.getPageCount() + let date = grobidData?.date || pdf.getCreationDate() || new Date() let volume let issn let number - let journal + let journal = grobidData?.journal let doi - let artTitle = ((author || title).replace(/\s/g, '')) + date.getFullYear() - let keywords: string[] = extractKeywords(text.text, 10) + let artTitle = ((author?.split(",")[0] || title).replace(/\s/g, '')) + date.getFullYear() + let keywords: string[] = [] + if (grobidData?.keywords){ + grobidData.keywords.forEach((keyword) => keywords.push(keyword)) + } + extractKeywords(text.text, 5).forEach((keyword) => keywords.push(keyword)) return {artType, author, date, pages, artTitle, title, volume, issn, number, journal, doi, keywords} } + +async function analyzeWithBackend(file: File) { + const formData = new FormData(); + formData.append('file', file); + + try { + const response = await fetch(`${backendURL}/extract`, { // Replace with your actual backend endpoint + method: 'POST', + body: formData, + }); + + if (!response.ok) { + return undefined + } + + const data = await response.json(); + console.log(data) + + const title = data?.TEI?.teiHeader?.[0]?.fileDesc?.[0]?.titleStmt?.[0]?.title?.[0]?._ + const journal = data?.TEI?.teiHeader?.[0]?.fileDesc?.[0]?.publicationStmt?.[0]?.publisher?.[0]?._ + const dateString = data?.TEI?.teiHeader?.[0]?.fileDesc?.[0]?.publicationStmt?.[0]?.date?.[0]?.$?.when; + const date = dateString ? new Date(Date.parse(dateString)) : undefined; + const authorObject: any[] | undefined = data?.TEI?.teiHeader?.[0]?.fileDesc?.[0]?.sourceDesc?.[0]?.biblStruct?.[0]?.analytic?.[0]?.author + const authors: string[] = []; + authorObject?.forEach((author: any) => { + if (author.persName) { + authors.push(`${author.persName[0].forename[0]._} ${author.persName[0].surname[0]}`) + } + }) + const keywords: string[] | undefined = data?.TEI?.teiHeader?.[0]?.profileDesc?.[0]?.textClass?.[0]?.keywords?.[0]?.term + + return {title: title, journal: journal, date: date, authors: authors.join(", "), keywords: keywords}; + } catch (error) { + console.error('Error uploading file:', error); + } +} From febe25e0cf780aa841627730315450e48efdeb89 Mon Sep 17 00:00:00 2001 From: "l.kaesberg" Date: Thu, 28 Mar 2024 16:07:33 +0100 Subject: [PATCH 4/4] feat: only expose grobid api in dev --- docker-compose.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 658d268..fe8bd0e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -37,9 +37,6 @@ services: grobid: image: lfoppiano/grobid:0.8.0 - ports: - - "8070:8070" - - "8071:8071" volumes: postgres_data: