#!/usr/bin/env texlua
-- extractbb-lua
-- https://github.com/gucci-on-fleek/extractbb
-- SPDX-License-Identifier: MPL-2.0+
-- SPDX-FileCopyrightText: 2024 Max Chernoff
--
-- Inclusion Methods
-- =================
--
-- This script can use two different methods to extract bounding boxes from
-- images: the "img" module and the "pdfe" module. The "img" module will be
-- automatically selected in most cases and supports all image types that are
-- supported by the original "extractbb" program. If and only if the "img"
-- module fails to load, the "pdfe" module will be used as a fallback. However,
-- the "pdfe" module only supports PDF files. Both modules are built in to the
-- LuaTeX binaries, however due to some technical issues, the "img" module may
-- fail to load on some more exotic platforms.
--
--
-- Compatibility
-- =============
--
-- Based off of my testing, this Lua script is 100% compatible with the original
-- C-based "extractbb" program, with the following exceptions:
--
--   * When running in "img" mode, the PDF version is always reported as "1.5".
--
--   * When running in "img" mode, if the requested bounding box is not found,
--     the script will fall back to the Crop box or the Media box, instead of
--     following the original fallback order. (In practice, almost all PDFs set
--     all their bounding boxes equal to each other, and even if the boxes are
--     set to different values, the script will still return the requested box,
--     provided that it is set in the PDF.)
--
--   * When running in "pdfe" mode, only PDF files are supported.
--
-- All of these issues are very unlikely to affect any real-world documents.
--
--
-- Security
-- ========
--
-- This script is designed to be safely run from restricted shell escape. A few
-- security features:
--
--   * The majority of this script runs inside a sandboxed Lua environment,
--     which only exposes a very restricted set of functions.
--
--   * All file-related functions available inside the sandbox first check with
--     kpathsea to ensure that the file is allowed to be opened.
--
--   * In the event of any errors, the script immediately exits.
--
--   * This script does not run (fork/exec) any external programs.
--
--   * This script is written entirely in Lua, so overflow/use-after-free
--     vulnerabilities are not possible.
--
-- Some potential security concerns:
--
--   * This script has not been audited or reviewed by anyone other than myself.
--
--   * Using the "ffi" module to load the "img" library is technically undefined
--     behaviour, and as such may potentially lead to unforeseen security
--     issues.
--
--   * The underlying LuaTeX modules may themselves have security
--     vulnerabilities, which would be inherited by this script.

----------------------
--- Initialization ---
----------------------

-- Pre-sandbox variables/constants
local show_errors = true
local SOURCE_DATE_EPOCH = tonumber(os.getenv("SOURCE_DATE_EPOCH"))
local version = "extractbb.lua v1.0.5 (2024-11-21)" --%%version %%dashdate

-- Required for any kpathsea calls to work.
kpse.set_program_name("texlua", "extractbb")

--------------------------
--- Questionable Hacks ---
--------------------------

-- LuaTeX doesn't load the "img" library in "texlua" mode, so we need this
-- questionable hack to load it manually. We do it inside of "pcall" since there
-- are some exotic platforms where the "ffi" module is unsupported. Any failure
-- in here is deliberately swallowed: the script then continues without "img".
pcall(function()
    local ffi

    if img then
        -- Ok, we're running under a recent LuaTeX that enables the "img"
        -- library in "texlua" mode, so we can skip the FFI hack.
        ffi = false
    else
        ffi = package.loaded.ffi
        ffi.cdef[[
            typedef struct lua_State lua_State;
            typedef int (*lua_CFunction) (lua_State *L);
            lua_State *Luas;
            void luaL_requiref(lua_State *L, const char *modname,
                               lua_CFunction openf, int glb);
            int luaopen_img(lua_State * L);
            int lua_only;
        ]]

        -- Basic initialization
        ffi.C.lua_only = 0
    end

    tex.initialize()

    -- "tex" module
    _G.tex = package.loaded.tex
    tex.enableprimitives("", tex.extraprimitives())
    tex.outputmode = 1
    tex.interactionmode = 0

    -- "pdf" module
    _G.pdf = package.loaded.pdf
    pdf.setignoreunknownimages(1)
    pdf.setmajorversion(2)
    pdf.setminorversion(0)

    -- "img" module
    if ffi then
        ffi.C.luaL_requiref(ffi.C.Luas, "img", ffi.C.luaopen_img, 1)
    end
end)

-- In case of failure, define an empty "img" table. This leaves "img.scan" as
-- nil, which is what later disables "open_image" inside the sandbox.
if not img then
    _G.img = {}
end

------------------
--- Sandboxing ---
------------------

-- Prepare the sandbox for the rest of the script. Only the names listed here
-- (plus the helpers added in the "do" block below) are visible once "_ENV" is
-- replaced.
local env = {
    arg = arg,
    io = {
        stdout = io.stdout,
    },
    ipairs = ipairs,
    math = math,
    os = {
        date = os.date,
        exit = os.exit,
    },
    pairs = pairs,
    pdfe = pdfe,
    print = print,
    select = select,
    table = table,
    tonumber = tonumber,
    type = type,
}

do
    -- Saved global functions. These are captured as upvalues so that the
    -- helpers below keep working after the global environment is swapped out.
    local debug_traceback = debug.traceback
    local find_file = kpse.find_file
    local img_scan = img.scan
    local io_open = io.open
    local io_stderr = io.stderr
    local kpse_in_name_ok = kpse.in_name_ok
    local kpse_out_name_ok = kpse.out_name_ok
    local kpse_var_value = kpse.var_value
    local lfs_attributes = lfs.attributes
    local os_exit = os.exit
    local os_setenv = os.setenv
    local pdfe_open = pdfe.open
    local select = select
    local tostring = tostring

    -- Error messages
    --
    -- Reports a fatal error and terminates the script with exit status 1.
    -- Every argument is converted with "tostring" and written to stderr,
    -- followed by a traceback. Nothing is printed when "show_errors" is false,
    -- but the script still exits.
    local function error(...)
        if show_errors then
            -- Header
            io_stderr:write("! extractbb ERROR: ")

            -- Message
            for i = 1, select("#", ...) do
                io_stderr:write(tostring(select(i, ...)), " ")
            end

            -- Traceback
            io_stderr:write("\n", "\n")
            io_stderr:write(debug_traceback(nil, 2), "\n")
        end

        -- Flush and exit
        io_stderr:flush()
        os_exit(1)
    end
    env.error = error

    -- Make sure that "openin_any" is at least "restricted", and that
    -- "openout_any" is at least "paranoid".
    local initial_openin = kpse_var_value("openin_any")
    local initial_openout = kpse_var_value("openout_any")

    -- Only tighten the input policy: both "r" (restricted) and "p" (paranoid)
    -- already satisfy "at least restricted", so a stricter "p" must be left
    -- alone rather than being reset to "r". The decision depends solely on the
    -- input policy, never on the output policy.
    if (initial_openin ~= "r") and (initial_openin ~= "p") then
        os_setenv("openin_any", "r")
    end

    if (initial_openout ~= "p") then
        os_setenv("openout_any", "p")
    end

    -- Check the input paths.
    --
    -- Looks the name up with kpathsea, then requires that kpathsea permits
    -- reading the resulting path and that it is a regular file. Returns the
    -- resolved path; any failure goes through "error" and exits.
    local function resolve_input_name(file_name)
        local file_path = find_file(file_name, "graphic/figure", true)
        if not file_path then
            error("Cannot find input file:", file_name)
        end

        local allowed = kpse_in_name_ok(file_path)
        if not allowed then
            error("Input file is not allowed:", file_path)
        end

        local mode = lfs_attributes(file_path, "mode")
        if mode ~= "file" then
            error("Input file is not a regular file:", file_path)
        end

        return file_path
    end

    -- Check the output paths.
    --
    -- Requires that kpathsea permits writing the name, that its extension is
    -- "xbb" or "bb", and that the part before the extension has no control or
    -- shell-special characters. Returns the name unchanged; any failure goes
    -- through "error" and exits.
    local function resolve_output_name(file_name)
        local allowed = kpse_out_name_ok(file_name)
        if not allowed then
            error("Output file is not allowed:", file_name)
        end

        local name, extension = file_name:match("(.+)%.([^.]-)$")
        if (not name) or (not extension) or
           (name == "") or (extension == "")
        then
            error("Output file has no extension:", file_name)
        end

        if (extension ~= "xbb") and (extension ~= "bb") then
            error("Output file has an invalid extension:", file_name)
        end

        -- We shouldn't allow files with weird characters in their names.
        if name:match("[%c%%\t\r\n><*|]") then
            error("Output file has an invalid name:", file_name)
        end

        return file_name
    end

    -- Opens a file.
    --
    -- "read_write" must be "read" or "write" and "binary_text" must be
    -- "binary" or "text"; anything else is a fatal error. The name is vetted
    -- by the matching resolver above before "io.open" is called. Returns the
    -- open file handle.
    function env.open_file(file_name, read_write, binary_text)
        local file_path, mode

        if read_write == "read" then
            file_path = resolve_input_name(file_name)
            mode = "r"
        elseif read_write == "write" then
            file_path = resolve_output_name(file_name)
            mode = "w"
        else
            error("Invalid read/write mode:", read_write)
        end

        if binary_text == "binary" then
            mode = mode .. "b"
        elseif binary_text == "text" then
            mode = mode .. ""
        else
            error("Invalid binary/text mode:", binary_text)
        end

        local file, message = io_open(file_path, mode)
        if not file then
            error("Cannot open file:", file_path, message)
        end

        return file
    end

    -- Open a PDF file.
    --
    -- "env.pdfe" is the real "pdfe" table, so this replaces "pdfe.open" with a
    -- wrapper that vets the path first; the original function was saved above
    -- as "pdfe_open".
    function env.pdfe.open(file_name)
        local file_path = resolve_input_name(file_name)
        return pdfe_open(file_path)
    end

    -- Open an image file.
    --
    -- Vets the path, then hands it to "img.scan" together with the requested
    -- page and page box. Returns whatever "img.scan" returns.
    function env.open_image(file_name, page, box)
        local file_path = resolve_input_name(file_name)
        return img_scan {
            filename = file_path,
            filepath = file_path,
            page = page,
            pagebox = box,
        }
    end

    -- Without "img.scan" the image path is unusable. Store "false" (not nil)
    -- so that reading "open_image" doesn't trip the undefined-index guard.
    if not img_scan then
        env.open_image = false
    end
end

-- Prevent trying to change the environment. Used for both "__index" and
-- "__newindex"; in either case the second argument is the offending key.
local function bad_index(...)
    env.error("Attempt to access an undefined index:", select(2, ...))
end

setmetatable(env, {
    __index = bad_index,
    __metatable = false,
    __newindex = bad_index,
})

-- Set the environment.
_ENV = env

-----------------------------------
--- Post-Sandbox Initialization ---
-----------------------------------

-- Constants
local BP_TO_SP = 65781.76
local IN_TO_BP = 72
local DATE_FORMAT = "%a %b %d %H:%M:%S %Y" -- "%c"

-- Save often-used globals for a slight speed boost.
local floor = math.floor
local insert = table.insert
local remove = table.remove
local script_arguments = arg
local unpack = table.unpack

-- General-purpose functions

-- Rounds to the nearest integer, with halves rounded up.
local function round(number)
    return floor(number +0.5)
end

-------------------------
--- Argument Handling ---
-------------------------

-- Define the argument handling functions.
local process_arguments = {}

-- Each handler is stored under its option name without the leading dash (so
-- "--help" is stored as "-help"). It receives the list of remaining arguments
-- and may consume its own value from the front of that list.

-- > Specify a PDF pagebox for bounding box
-- > pagebox=cropbox, mediabox, artbox, trimbox, bleedbox
local bbox_option = "auto"
function process_arguments.B(script_arguments)
    bbox_option = remove(script_arguments, 1)
end

-- > Show this help message and exit
function process_arguments.h(script_arguments)
    print [[
Usage: extractbb [-B pagebox] [-p page] [-q|-v] [-O] [-m|-x] FILE...
       extractbb --help|--version
Extract bounding box from PDF, PNG, JPEG, JP2, or BMP file; default output below.

Options:
  -B pagebox    Specify a PDF pagebox for bounding box
                pagebox=cropbox, mediabox, artbox, trimbox, bleedbox
  -h | --help   Show this help message and exit
  --version     Output version information and exit
  -p page       Specify a PDF page to extract bounding box
  -q            Be quiet
  -v            Be verbose
  -O            Write output to stdout
  -m            Output .bb file used in DVIPDFM (default)
  -x            Output .xbb file used in DVIPDFMx
]]
    os.exit(0)
end
process_arguments["-help"] = process_arguments.h

-- > Output version information and exit
function process_arguments.V(script_arguments)
    print(version)
    os.exit(0)
end
process_arguments["-version"] = process_arguments.V

-- > Specify a PDF page to extract bounding box
-- "tonumber" yields nil for a missing or non-numeric value; the page number
-- is validated after all arguments have been processed.
local page_number = 1
function process_arguments.p(script_arguments)
    page_number = tonumber(remove(script_arguments, 1))
end

-- > Be quiet
function process_arguments.q(script_arguments)
    show_errors = false
end

-- > Be verbose
function process_arguments.v(script_arguments)
    show_errors = true
end

-- > Write output to stdout
local output_file
function process_arguments.O(script_arguments)
    output_file = io.stdout
end

-- Output format
--
-- The default is "bb" when the script is invoked under a name containing
-- "ebb", and "xbb" otherwise. Only the final path component is inspected so
-- that a directory that merely contains "ebb" (e.g. "/home/webb/") does not
-- change the default; a missing script name is treated as an empty one.
local output_format = "xbb"
local script_name = (script_arguments[0] or ""):match("[^/\\]*$")
if script_name:match("ebb") then
    output_format = "bb"
end

-- > Output .bb file used in DVIPDFM (default)
function process_arguments.m(script_arguments)
    output_format = "bb"
end

-- > Output .xbb file used in DVIPDFMx
function process_arguments.x(script_arguments)
    output_format = "xbb"
end

-- Get the input file name.
local input_name
function process_arguments.i(script_arguments)
    input_name = remove(script_arguments, 1)
end
process_arguments["-input-name"] = process_arguments.i

-- Clear the interpreter and script names.
script_arguments[-1] = nil
script_arguments[0] = nil

-- Process the arguments.
while script_arguments[1] do
    -- Get the next argument.
    local arg = remove(script_arguments, 1)
    local cmd = arg:match("^%-(.*)$")

    -- Default to "--input-name" if no command is given. The argument is pushed
    -- back so that the handler can consume it as the file name.
    if not cmd then
        insert(script_arguments, 1, arg)
        cmd = "-input-name"
    end

    -- Handle multi-character arguments: a bundle such as "-qv" is replaced by
    -- "-q", "-v" at the front of the list (in the same order) and the loop
    -- starts over. Long options (which still begin with "-" here) are exempt.
    if (cmd:len() >= 2) and (not cmd:match("^%-")) then
        local i = 0
        for char in cmd:gmatch(".") do
            i = i + 1
            insert(script_arguments, i, "-" .. char)
        end
        goto continue
    end

    -- Get the function to process the argument and run it.
    local func = process_arguments[cmd]
    if not func then
        error("Invalid argument:", arg)
    end
    func(script_arguments)

    ::continue::
end

-- Validate the arguments.
--
-- The type test must be written with "~=": "not type(x) == 'number'" parses
-- as "(not type(x)) == 'number'", which is always false. Page numbers are
-- counted from 1, so anything below 1 or non-integral is rejected as well.
if (type(page_number) ~= "number") or
   (page_number < 1) or
   (page_number % 1 ~= 0)
then
    error("Invalid page number:", page_number)
end

if not input_name then
    error("No input file specified.")
end

-- Validate the bounding box type. We need this rather crazy fallback scheme
-- to match the behaviour of "extractbb". Each list names the boxes to try, in
-- order, with the spelling used by the "img" and "pdfe" libraries; every list
-- is built by prepending one box to the previous list.
local bbox_orders = {}
bbox_orders.mediabox = {
    { img = "media", pdfe = "MediaBox" },
}
bbox_orders.cropbox = {
    { img = "crop", pdfe = "CropBox" },
    unpack(bbox_orders.mediabox)
}
bbox_orders.artbox = {
    { img = "art", pdfe = "ArtBox" },
    unpack(bbox_orders.cropbox)
}
bbox_orders.trimbox = {
    { img = "trim", pdfe = "TrimBox" },
    unpack(bbox_orders.artbox)
}
bbox_orders.bleedbox = {
    { img = "bleed", pdfe = "BleedBox" },
    unpack(bbox_orders.trimbox)
}
bbox_orders.auto = {
    bbox_orders.cropbox[1],
    bbox_orders.artbox[1],
    bbox_orders.trimbox[1],
    bbox_orders.bleedbox[1],
    bbox_orders.mediabox[1],
}

local bbox_order = bbox_orders[bbox_option]
if not bbox_order then
    error("Invalid PDF box type:", bbox_option)
end

-- Set the default pixel resolution.
local default_dpi
if output_format == "xbb" then
    default_dpi = 72
elseif output_format == "bb" then
    default_dpi = 100
else
    error("Invalid output format:", output_format)
end

-- Open the output file. Unless "-O" selected stdout, the output name is the
-- input name with its extension replaced by the output format.
if not output_file then
    local base_name = input_name:match("(.+)%.([^.]-)$") or input_name
    local output_name = base_name .. "." .. output_format
    output_file = open_file(output_name, "write", "text")
end

------------------------
--- Image Processing ---
------------------------

local x_min, y_min, x_max, y_max
local num_pages, image_type
local pdf_major_version, pdf_minor_version

if open_image then
    -- Check the number of pages.
    local image = open_image(input_name)
    if not image then
        error("Cannot open image:", input_name)
    end

    num_pages = image.pages
    if type(num_pages) ~= "number" then
        error("Invalid number of pages:", num_pages)
    end

    if page_number > num_pages then
        error("Invalid page number:", page_number)
    end

    -- Open the image to the specified page and bounding box. If the requested
    -- bounding box is not available, LuaTeX will fall back to the crop box
    -- or the media box.
    image = open_image(input_name, page_number, bbox_order[1].img)
    if not image then
        error("Cannot open image:", input_name)
    end

    -- Get the image metadata.
    image_type = image.imagetype

    local bounding_box = image.bbox
    if not bounding_box then
        error("Cannot get bounding box:", page_number)
    end

    -- A missing or zero resolution falls back to the format's default DPI.
    local x_resolution = image.xres
    local y_resolution = image.yres

    if (x_resolution or 0) == 0 then
        x_resolution = default_dpi
    end

    if (y_resolution or 0) == 0 then
        y_resolution = default_dpi
    end

    -- Convert the bounding box to PostScript points. PDF dimensions are
    -- divided by BP_TO_SP; all other image types are scaled by the
    -- resolution, using the x resolution for odd entries and the y resolution
    -- for even ones.
    for i, dimen in ipairs(bounding_box) do
        if image_type == "pdf" then
            dimen = dimen / BP_TO_SP
        else
            if i % 2 == 1 then
                dimen = dimen / x_resolution * IN_TO_BP
            else
                dimen = dimen / y_resolution * IN_TO_BP
            end
        end
        bounding_box[i] = dimen
    end

    -- Save the bounding box.
    x_min, y_min, x_max, y_max = unpack(bounding_box)

    -- We can't get the PDF version with the "img" library, so we'll just
    -- pretend that it's v1.5 (which supports most features).
    pdf_major_version = 1
    pdf_minor_version = 5
else
    -- Fallback to PDFs only.
    image_type = "pdf"

    local document = pdfe.open(input_name)
    if pdfe.getstatus(document) ~= 0 then
        error("Cannot open PDF file:", input_name)
    end

    -- Check the number of pages.
    num_pages = pdfe.getnofpages(document)
    if type(num_pages) ~= "number" then
        error("Invalid number of pages:", num_pages)
    end

    if page_number > num_pages then
        error("Invalid page number:", page_number)
    end

    -- Get the page.
    local page = pdfe.getpage(document, page_number)
    if not page then
        error("Cannot get page:", page_number)
    end

    -- Get the bounding box. Here, we check the boxes in the exact same order
    -- that "extractbb" does.
    local bounding_box
    for _, bbox in ipairs(bbox_order) do
        bounding_box = pdfe.getbox(page, bbox.pdfe)
        if bounding_box then
            break
        end
    end

    if not bounding_box then
        error("Cannot get bounding box:", page_number)
    end

    -- Save the bounding box.
    x_min, y_min, x_max, y_max = unpack(bounding_box)

    -- Get the PDF version.
    pdf_major_version, pdf_minor_version = pdfe.getversion(document)
end

-- Validate the bounding box. A fixed-count loop is required here: "ipairs"
-- stops at the first nil, so it would silently skip exactly the case that
-- matters most (a missing coordinate).
local bbox_values = { x_min, y_min, x_max, y_max }
for i = 1, 4 do
    if type(bbox_values[i]) ~= "number" then
        error("Invalid bounding box:", x_min, y_min, x_max, y_max)
    end
end

--------------
--- Output ---
--------------

-- Get the output fields and values.
local lines = {}

insert(lines, ("Title: %s"):format(input_name))
insert(lines, ("Creator: %s"):format(version))
insert(lines,
    ("BoundingBox: %d %d %d %d")
    :format(round(x_min), round(y_min), round(x_max), round(y_max))
)

-- The high-resolution box and the PDF details are only written for ".xbb".
if output_format == "xbb" then
    insert(lines,
        ("HiResBoundingBox: %0.6f %0.6f %0.6f %0.6f")
        :format(x_min, y_min, x_max, y_max)
    )

    if image_type == "pdf" then
        insert(lines,
            ("PDFVersion: %d.%d")
            :format(pdf_major_version, pdf_minor_version)
        )
        insert(lines, ("Pages: %d"):format(num_pages))
    end
end

-- "os.date" uses the current time when SOURCE_DATE_EPOCH is unset (nil).
insert(lines,
    ("CreationDate: %s"):format(os.date(DATE_FORMAT, SOURCE_DATE_EPOCH))
)

-- Create the output text. Every line is prefixed with "%%" and the text ends
-- with a blank line.
local begin_line = "%%"
local end_line = "\n"

local text = begin_line ..
             table.concat(lines, end_line .. begin_line) ..
             end_line .. end_line

-- Write the output text.
output_file:write(text)
output_file:close()

-- Everything is done, so now we can exit.
os.exit(0)