I need a javascript that can extract specific text from a PDF - javascript

I work as legal support in litigation. I am not that clued up on scripting, but have managed to adapt a few google searches to perform various tasks for in Adobe.
What I need is help with what I think should be a simple script to read through a PDF and extract Document IDs. They are enclosed in square brackets, so I just need to extract all text between square brackets to a text or CSV file. I have tried using the ChatGPT bot but that hasnt been very successful. This is the code it has given me
// Open the PDF file
var filePath = "/path/to/your/file.pdf";
var doc = app.open(filePath);
// Get the number of pages
var numPages = doc.numPages;
// Create an array to hold the results
var results = [];
// Loop through each page and extract text between square brackets
for (var i = 0; i < numPages; i++) {
var page = doc.getPageNthWordQuads(i);
for (var j = 0; j < page.length; j++) {
var word = page[j];
var text = word[4];
// Check if the text is between square brackets
if (text.startsWith("[") && text.endsWith("]")) {
// Remove the brackets and add the text to the results array
results.push(text.slice(1, -1));
}
}
}
// Save the results to a text file
var outputPath = "/path/to/your/output/file.txt";
var outputFile = new File(outputPath);
outputFile.open("w");
outputFile.write(results.join("\n"));
outputFile.close();
// Close the PDF file
doc.close();
I ran the script, with my file directory, not the placeholder in the script, but nothing happened. No error or anything
I am using a work PC so I cant install python or any other program, hence the need for Java or possibly powershell if that will work
Can anyone help me?

Actually I realised i could do this using the Evermap plugin. Highlight text by pattern - [(.*?)], then extract highlighted text.

This can be achieved with pdf.js library: below example shows if specific text in the first page but can be furhter extended to check the whole pdf. Hope this helps!
// Load PDF.js library
const pdfjsLib = require('pdfjs-dist');
// Load PDF file
const url = 'path/to/pdf/file.pdf';
const loadingTask = pdfjsLib.getDocument(url);
loadingTask.promise.then(function(pdf) {
// Load the first page
pdf.getPage(1).then(function(page) {
// Get the text content of the page
page.getTextContent().then(function(textContent) {
// Iterate through each text item
for (let i = 0; i < textContent.items.length; i++) {
const item = textContent.items[i];
// Check if the text item matches your criteria
if (item.str.includes('specific text')) {
console.log(item.str);
}
}
});
});
});

You can get rid of the entire for loop and just use a regular expression with String.match():
const data = "This is the text ofa pdf file [documentid1] and [documentid2].";
const matches = data.match(/(?<=\[).*?(?=\])/gs);
console.log(matches);

Related

Why won't the makeCopy portion of my script execute properly in Google Apps Script?

I am a very novice coder and am trying to accomplish the following using a Google Form:
Rename file uploaded by user based on name defined by combination of form fields
Create a copy of the uploaded file to a specific folder in GDrive, based on answer to particular form question
So far, I have managed to get Part 1 working, but Part 2 doesn't seem to function properly (no error message, just no action). Anyone able to guide me where I'm going wrong?
function fileRename() {
var form = FormApp.getActiveForm()
// returns the total number of form submissions
var length=form.getResponses().length;
//retrieve fileID of document uploaded by user in Question 6 of the form (i.e. Index 5)
var id=form.getResponses()[length-1].getItemResponses()[5].getResponse();
//getResponses()[length-1] retrieves the last form response, accounting for the fact that the first index is zero and hte last length-1
//gets the form answers used to concatenate the file name
var fileUploadEntity=form.getResponses()[length-1].getItemResponses()[0].getResponse();
var fileUploadDate=form.getResponses()[length-1].getItemResponses()[3].getResponse();
var fileUploadType=form.getResponses()[length-1].getItemResponses()[1].getResponse();
//accesses the uploaded file
var file=DriveApp.getFileById(id);
var name = file.getName();
//changes the file name
var name = fileUploadEntity+'_'+fileUploadDate+'_'+fileUploadType
file.setName(name);
//creates a copy and saves it to the relevant regional shared drive depending on which array the entity belongs to, using its four-letter identifier
var APAC = ["WRAU", "WRNZ", "WRSG", "WRMY", "WRHK"];
var NORAM = ["WRCA", "WRCC", "WRCW", "WRUS"];
var MEA = ["WRKE", "WRUG", "WRSO", "WRSA", "WRRW", "WRTZ", "WRZW"];
var LATAM = ["WRMX"];
var EEA = ["WRBE", "WRUK"];
var folderAPAC = DriveApp.getFolderById('1IKIDSEEGHf802WaF4l4ntN9uiUO5jJpa');
var folderNORAM = DriveApp.getFolderById('1BitldN3Uw7453wxnnI1X5PUmbmTiQn5O');
var folderMEA = DriveApp.getFolderById('18tWR1C-mdO7moAtktOHJsvXjx_V0kdg0');
var folderLATAM = DriveApp.getFolderById('1cG0iPocn3KyXK8XgaxnZNWVU-HKJ97dX');
var folderEEA = DriveApp.getFolderById('1N8tB8AjMkR7gRarcwd4NYmry_wh0WVkY');
if (fileUploadEntity.indexOf(APAC)>-1) {
file.makeCopy(name, folderAPAC);
}
else if (fileUploadEntity.indexOf(NORAM)>-1) {
file.makeCopy(name, folderNORAM);
}
else if (fileUploadEntity.indexOf(LATAM)>-1) {
file.makeCopy(name, folderLATAM);
}
else if (fileUploadEntity.indexOf(MEA)>-1) {
file.makeCopy(name, folderMEA);
}
else if (fileUploadEntity.indexOf(EEA)>-1) {
file.makeCopy(name, folderEEA);
}
}
You code is using indexOf the wrong way.
Instead of
fileUploadEntity.indexOf(APAC)
try
APAC.indexOf(fileUploadEntity)
Do the same or the other places where indexOf is used
Reference
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/indexOf

Adobe Illustrator JavaScript - Applying a dataset to each open document in a loop

I've written a simple script to open two documents in Adobe Illustrator (File -> Scripts) and apply some commands to those documents.
The purpose of the script; with no documents open in Illustrator, the script will:
Open the required documents.
Load an XML variable library into those documents.
Display the dataset in that library (there is only one dataset).
Save the file in a given location.
Close the file.
The script:
try
{
var doc_1 = open(File("C:/file1.ai"));
var doc_2 = open(File("C:/file2.ai"));
var sourceDoc;
var targetFile;
var options = new IllustratorSaveOptions();
for (var i = 0; i < app.documents.length; i++ )
{
sourceDoc = app.documents[i];
sourceDoc.importVariables(new File("C:/variables.xml"));
sourceDoc.dataSets[0].display();
targetFile = new File("C:/output" + sourceDoc.name + '.ai');
sourceDoc.saveAs( targetFile, options );
sourceDoc.close();
}
}
catch(err)
{
alert(err);
}
This code works only for the last opened file (file2.ai). file1.ai is never touched, almost as if there is no loop.
What could be causing this?
When closing the first file in the loop you just moved app.documents[1] to app.documents[0]
You want top loop through them backwards
try your loop with
for (var i = app.documents.length; i >0 ; i-- )

How to use 'row elements' of excel file(xlsx) in CasperJS?

I want to import excel file (it consists of one column and many rows of string ) to my JavaScript and I'll use these strings elements of .xlsx to searching text automatically through CasperJS
How can I import excel file and make all of elements take turns applied?
Here is my code and I want to put the elements of excel file to "something"
casper.start('http://thehomepage.com/');
// start at homepage
casper.then( function (){
this.sendKeys('#dicQuery','**something**');
// I want to put my elements iteratelly
console.log('entering text');
});
casper.thenClick(x('//*[#id="field"]/a'), function(){
console.log('click searching');
});
casper.then(function() {
words = this.evaluate(getWords);
});
function createFinal(wordArray) {
var out = [];
// remove duplicating START
var a = {};
for(var i=0; i <wordArray.length; i++){
if(typeof a[wordArray[i]] == "undefined")
a[wordArray[i]] = 1;
}
wordArray.length = 0;
for(i in a)
wordArray[wordArray.length] = i;
// remove duplicating END
wordArray.forEach(function(my_word) {
out.push({"moeum": "**something**", "word": my_word});
}); // I want to put my elements in it iteratelly
return out;
}
I don't think there is xlsx file reader in PhantomJS (and therefore CasperJS), but you can save your xlsx file as csv. Since it is a simple text file, then you can read it and build your sheet yourself.
For example:
var fs = require("fs");
var sheet = fs.read("data.csv")
.split("\n")
.map(function(row){
return row.split(";"); // or which even split character your have chosen for CSV
});
Then you can access it like this:
sheet[rowIndex][colIndex]

Javascript ignoring all files as opposed to one

I'm trying to get my javascript to ignore one file type extension that's held in a folder with a bunch of photoshop images. For all of the other file types in the folder I have it so that these file types populate a window and the user can import into their work space.
I have modified my script to ignore the file extension I want ignored, however it no longer populates the window with all of the other file types containted in the folder. But when I take out the file I want ignore from the folder, the window gets populated as it should.
This is what I have at the moment that checks my folder for the file types:
//Prompt for folder location
var Path = Folder.selectDialog("Select Folder Location for Renders")
// Use the path to the application and append the samples folder
var samplesFolder = Folder(Path)
//Get the files
var fileList = samplesFolder.getFiles()
//Creat Array to hold names
var renderTypes = new Array();
//Parse Initial name to get similar render elements
var beautyRender = fileList[0].name
beautyRender = beautyRender.substr(0, beautyRender.length-4)
//Get the render elements with a similar name
for (var i = 0; i < fileList.length; i++)
{
var filename = fileList[i].name;
if (filename.match(/\.(stitchinfo)$/i) == null)
{
if(fileList[i].name.substring(0,beautyRender.length) === beautyRender)
{
renderTypes.push(fileList[i].name);
}
}
}
Can anyone see what I've done wrong and need to modify?
Update
I'm still trying to get this to work and following the help from one of the posters below I have modified my code to the following:
for (var i = 0; i < fileList.length; i++)
{
var filename = fileList[i].name;
if (filename.match(/\.(stitchinfo)$/i) == null)
{
renderTypes.push(fileList[i].name);
}
}
However, with this new code comes a new problem in that it returns every file contained in the folders and displays it.
I'm still stumped as to how I can get this to work as I would like. Please can anyone help me?
You're creating a sparse array, because you skip elements in renderTypes when you ignore a filename in fileList. That may be confusing your rendering code. Change to:
renderTypes.push(fileList[i].name);
What if :
for (var i = 0; i < fileList.length; i++)
{
var filename = fileList[i].name;
if (filename.match(/\.(stitchinfo)$/i) == null)
{
if(fileList[i].name.substring(0,beautyRender.length) === beautyRender)
{
renderTypes.push(fileList[i].name);
}
}
}
Wrong usage of the array
Missing ";"
Unnecessary use of "continue".
Managed to get a fix for this.
What I've ended up doing is creating a new function and passing that into my call for checking the folder locations.
function ignoreMe(f)
{
return f.name.match(/\.(stitchinfo)$/i) == null;
}
And the folder check:
var fileList = samplesFolder.getFiles(ignoreMe);

Can't get element javascript google apps script

I'm trying to get the link element in a feed, i can get the description and title, but cannot get the link element. It seems weird to me. Here is my code
var url = "http://healthyhow.net/feed";
var response = UrlFetchApp.fetch(url);
var shareDoc = Xml.parse(response.getContentText(), true);
var root = shareDoc.getElement(); // feed element
var entries = root.getElement("channel").getElements("item");
for (var i=0; i<1; i++) { //just pick the first entry
var e = entries[i];
var title = e.getElement("title").getText();
var link = e.getElement("link").getText();
var description = e.getElement("description").getText();
}
Could anyone point out what is wrong here?
Thanks!
The docs indicate you should use lenient parsing for HTML - it's unclear what this is doing underneath, but in your case maybe it's confusing an HTML <link> element with a generic XML element with that same tag name. It appears to parse the link entries into something like this for your code (which you can see in shareDoc.toXmlString()):
<link/>http://healthyhow.net/l-arginine-natural-treatment-for-hypertension/
Since it's an empty tag, no text.
Change:
var shareDoc = Xml.parse(response.getContentText(), true);
to be:
var shareDoc = Xml.parse(response.getContentText(), false);
and you should be able to get the link text.

Categories