/**
* @fileOverview Houses the core [nb2json](module-convert.html#.nb2json) function and accompanying utils.
* Functions exposed from [browser](module-Ipynb2web_browser.html) and [node](module-Ipynb2web_node.html).
*
* Where processing happens
* - -1 - Calling nb2json - yaml filename returned gets formatted
* - 0 - nb2json - meta.filename is fixed up right before returning too
* - 0 - nb2json - meta.prettify inserts script
* - 0 - nb2json - replaceEmojis
* - 0 - nb2json - convertNotes
* - 1 - get_metadata - yaml is parsed, title, summary, keyValues set
*
* @module convert
* @exports {Object} - An object containing utility functions.
* @author Charles Karpati
*/
import { marked } from "marked";
import { makeDetails, replaceEmojis, convertNotes, replaceAndLog, collapseHeaders } from './convert_util.mjs'
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// fname = ./src/ipynb/route/filename (without the .ipynb extension, when the server is calling it)
// fname = /route/filename when called from the client
// meta.filename = last path segment of fname, LOWERCASED WITH SPACES REPLACED WITH UNDERSCORES.
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Module-level conversion state. nb2json resets every one of these at the start of each call.
// Set to true by processMarkdown when it rewrites a `<pre><code>` block to `pre.prettyprint`;
// nb2json then appends the code-prettify loader to the content.
let prettify = false;
// Source text of code cells whose last flag is `#export`, collected by processSource.
let pyCode = [];
// Assets queued by processOutput for extraction; returned by nb2json as `assets`.
let assetsToWrite = [];
// Running counter processOutput uses to name extracted images (`image-<n>.<ext>`).
let imageIndex = 0;
// Footnote counter handed to convertNotes and updated from its result in processMarkdown.
let footnoteCount = 0;
/**
 * Converts a Jupyter Notebook (.ipynb) file to a JSON object containing metadata and content as two distinct entries.
 *
 * The first cell is parsed for metadata with get_metadata; every following cell is converted
 * with convertNb and the pieces are joined into a single HTML string. Module-level state
 * (pyCode, prettify, assetsToWrite, imageIndex, footnoteCount) is reset on every call.
 *
 * @async
 * @param {string} ipynbPath - The path to the Jupyter Notebook file. When `process` is defined and the
 *   path does not start with "http", the notebook is requested from `http://localhost:8085/<ipynbPath>.ipynb`.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @param {string[]|boolean} [extractAssets=false] - Array of asset types to extract (e.g., ['png', 'js', 'txt', 'html']) or boolean for backward compatibility.
 * @returns {Promise<Object>} Resolves to `{ meta, content, assets }`: the notebook metadata, the processed
 *   HTML content, and the assets queued for extraction while converting.
 * @throws {Error} If the notebook request reports a non-OK HTTP status.
 * @memberof module:convert
 */
async function nb2json(ipynbPath, verbose = false, extractAssets = false) {
  // Start from a clean slate so nothing leaks in from a previous conversion.
  pyCode = [];
  prettify = false;
  assetsToWrite = [];
  imageIndex = 0;
  footnoteCount = 0;

  let url = ipynbPath;
  if (typeof process !== "undefined" && !ipynbPath.startsWith("http")) {
    url = `http://localhost:8085/${ipynbPath}.ipynb`;
  }
  const response = await fetch(url, { headers: { "Content-Type": "application/json; charset=utf-8" } });
  // Fail with a descriptive error instead of an opaque JSON/TypeError further down.
  // Compared against `false` so fetch stand-ins that only implement `json()` keep working.
  if (response.ok === false) {
    throw new Error(`nb2json: could not fetch notebook from ${url} (HTTP ${response.status} ${response.statusText})`);
  }
  const nb = await response.json();

  const meta = get_metadata(nb.cells[0]);
  // Last path segment, lowercased, with spaces replaced by underscores.
  meta.filename = ipynbPath.split("/").pop().toLowerCase().replaceAll(" ", "_");
  verbose && console.log('- get_metadata', meta, '\n');

  // Convert every cell after the metadata cell.
  let content = convertNb(nb.cells.slice(1), meta, verbose, extractAssets, meta.filename).flat().join(" ");
  verbose && pyCode.length && console.log({ pyCode });
  meta.pyCode = pyCode;

  // Load code-prettify when the notebook asks for it or a code block was rewritten to pre.prettyprint.
  if (meta.prettify || prettify) {
    content += `
<script src="https://cdn.jsdelivr.net/gh/google/code-prettify@master/loader/run_prettify.js"></script>
<link rel="stylesheet" href="https://cdn.rawgit.com/google/code-prettify/master/styles/desert.css"/>
`;
  }

  let resp = replaceEmojis(content);
  verbose && console.log('- - replaceEmojis Ran', '\n');
  resp = collapseHeaders(resp, meta.collapse, false);
  verbose && console.log('- - collapseHeaders Ran', '\n');
  resp = collapseHeaders(resp, meta.collapsable, true);
  verbose && console.log('- - collapsableHeaders Ran', '\n');
  return { meta, content: resp, assets: assetsToWrite };
}
/**
 * Extracts metadata from the first cell of a Jupyter Notebook, interpreting it as YAML-like lines.
 * Each line of the cell source is checked on its own. Special characters must have a space after them.
 *
 * The Lines:
 * ```
 * # Title
 * > summary
 * - key1: value1
 * ```
 * Will return:
 * ```
 * { title: "Title", summary: "summary", key1: "value1" }
 * ```
 *
 * Only the first "# " / "> " marker on a line is removed, so titles and summaries that contain
 * the marker again (e.g. "# Intro to C# programming") keep their text.
 *
 * @param {Object} [data] - The first cell of a Jupyter Notebook. Its `source` may be an array of
 *   lines or a single multi-line string; a missing cell or source yields an empty object.
 * @returns {Object} An object containing extracted metadata like title, summary, and key-values.
 */
function get_metadata(data) {
  const metadata = {};
  const source = data?.source ?? [];
  // A single-string source is split at line starts, keeping each line's trailing newline,
  // so it matches the array-of-lines form.
  const lines = typeof source === "string" ? source.split(/^/m) : source;
  for (const line of lines) {
    if (line.startsWith("#")) {
      // String.prototype.replaceAll has no count argument; replace() removes just the first marker.
      metadata.title = line.replaceAll("\n", "").replace("# ", "");
    } else if (line.startsWith(">")) {
      metadata.summary = line.replaceAll("\n", "").replace("> ", "");
    } else if (line.startsWith("-")) {
      const key = line.slice(line.indexOf("- ") + 2, line.indexOf(": "));
      const val = line
        .slice(line.indexOf(": ") + 2)
        .replaceAll("\n", "")
        .trim();
      metadata[key] = val;
    }
  }
  return metadata;
}
/**
 * Converts every cell of a notebook, in order, by handing each one to cleanCell.
 *
 * @param {Object[]} cells - An array of cells from a Jupyter Notebook.
 * @param {Object} meta - Metadata associated with the notebook.
 * @param {boolean} [verbose=false] - When true, the run is wrapped in a console group.
 * @param {string[]|boolean} [extractAssets=false] - Array of asset types to extract (e.g., ['png', 'js', 'txt', 'html']) or boolean for backward compatibility.
 * @param {string} [notebookName=null] - The name of the notebook for asset naming.
 * @returns {Array} One entry per cell, holding whatever cleanCell produced for it.
 */
function convertNb(cells, meta, verbose = false, extractAssets = false, notebookName = null) {
  if (verbose) {
    console.group('- convertNb Running');
  }
  const converted = [];
  for (const cell of cells) {
    converted.push(cleanCell(cell, meta, verbose, extractAssets, notebookName));
  }
  if (verbose) {
    console.groupEnd();
  }
  return converted;
}
/**
 * Dispatches a single notebook cell to the right processor.
 * Markdown cells have their source lines joined with a space and go to processMarkdown;
 * every other cell type is handed to processCode.
 *
 * @param {Object} cell - A cell from a Jupyter Notebook.
 * @param {Object} meta - Metadata associated with the notebook.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @param {string[]|boolean} [extractAssets=false] - Array of asset types to extract (e.g., ['png', 'js', 'txt', 'html']) or boolean for backward compatibility.
 * @param {string} [notebookName=null] - The name of the notebook for asset naming.
 * @returns {string|string[]} The processed markdown, or the pieces returned by processCode.
 */
function cleanCell(cell, meta, verbose = false, extractAssets = false, notebookName = null) {
  const isMarkdown = cell.cell_type == "markdown";
  return isMarkdown
    ? processMarkdown(cell.source.join(" "))
    : processCode(cell, meta, verbose, extractAssets, notebookName);
}
/**
 * Processes markdown content, converting it to HTML, handling special syntax, and applying transformations.
 *
 * Steps, in order: render with `marked`; turn trailing whitespace before `</p>` into a `<br>`;
 * rewrite `<pre><code>` blocks to `pre.prettyprint` (which sets the module-level `prettify` flag);
 * add `target`/`rel` attributes to links whose href does not start with "./"; then run convertNotes,
 * carrying the module-level footnote counter through.
 *
 * @param {string} txt - The markdown content to be processed.
 * @returns {string} The processed HTML content.
 */
function processMarkdown(txt) {
  // Does not process markdown wrapped in html
  let x = marked(txt);
  // Two spaces at lines end transform into line breaks
  x = x.replace(/\s{2,}<\/p>/g, "</p><br>");
  // Replace code blocks with pre.prettyprint. Single line (inline) code is NOT prettified.
  x = replaceAndLog(x, /<pre><code>([\s\S]*?)<\/code><\/pre>/g, (match, content) => { prettify = true; return `<pre class='prettyprint'>${content}</pre>`; });
  // Open links in a new tab, except links relative to the current page ("./...").
  x = replaceAndLog(x, /<a\s+(?:[^>]*?\s+)?href="(.*?)"/g, (match, href) => {
    if (!href.startsWith("./")) {
      // `noopener` (previously misspelled) stops the new tab from reaching back via window.opener.
      return `${match} target="_blank" rel="noopener noreferrer nofollow"`;
    }
    return match;
  });
  // create spans, inline footnotes ( Here is an inline note.^[Inlines notes are] ) , create elements ( :::{#id .class} )
  const result = convertNotes(x, footnoteCount);
  footnoteCount = result.count;
  return result.content;
}
/**
 * Processes a code cell from a Jupyter Notebook, applying various transformations based on flags and output type.
 *
 * Calls [getFlags](module-convert.html#.getFlags), [processSource](module-convert.html#.processSource), [processOutput](module-convert.html#.processOutput)
 *
 * Flags are read from the first source line only; when any are found, that line is dropped
 * before the remaining lines are joined with a space and passed to processSource.
 * Cells without an `outputs` array (cleanCell routes every non-markdown cell here) contribute
 * only their processed source.
 *
 * @param {Object} cell - A code cell from a Jupyter Notebook.
 * @param {Object} meta - Metadata associated with the notebook.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @param {string[]|boolean} [extractAssets=false] - Array of asset types to extract (e.g., ['png', 'js', 'txt', 'html']) or boolean for backward compatibility.
 * @param {string} [notebookName=null] - The name of the notebook for asset naming.
 * @returns {string[]} The processed source (if the cell has any) followed by one entry per output.
 */
function processCode(cell, meta, verbose = false, extractAssets = false, notebookName = null) {
  const rendered = [];
  let flags = [];

  // source
  if (cell["source"].length) {
    let source = cell["source"];
    flags = getFlags(source[0]);
    if (flags.length > 0) { source = source.slice(1); }
    rendered.push(processSource(source.join(" "), flags, meta));
  }

  // output — tolerate cells that carry no `outputs` key instead of throwing on `.length`.
  const outputs = cell["outputs"] ?? [];
  for (const output of outputs) {
    rendered.push(processOutput(output, flags, verbose, extractAssets, notebookName));
  }
  return rendered;
}
/**
 * Finds which of the recognised cell flags appear as whitespace-separated tokens in a line of source.
 *
 * @memberof module:convert
 * @param {string} source - The source code of a notebook cell (processCode passes its first line).
 * @returns {string[]} The recognised flags that are present, in the fixed order of the known-flag list.
 */
function getFlags(source) {
  const tokens = new Set(source.split(/\s+/));
  const knownFlags = [
    "#collapse_input_open",
    "#collapse_input",
    "#collapse_output_open",
    "#collapse_output",
    "#hide_input",
    "#hide_output",
    "#hide",
    "%%capture",
    "%%javascript",
    "%%html",
    "#export"
  ];
  return knownFlags.filter((flag) => tokens.has(flag));
}
/**
 * Processes the source of a code cell, applying transformations based on flags and metadata.
 * Strip Flags from text, make details, hide all. Append to pyCode
 *
 * Behaviour, in order:
 * - if the last flag is `#export`, the raw source is appended to the module-level `pyCode`;
 * - if any flag is `#hide`, `#hide_input`, `%%javascript`, `%%html` or `%%capture`, returns "";
 * - if `meta.prettify` is truthy, the source is wrapped in `<pre class='prettyprint'>`;
 * - only when `#collapse_input_open` is among the flags is the source rendered (flags stripped,
 *   wrapped with makeDetails); otherwise an empty string is returned.
 *
 * NOTE(review): a cell flagged with `#collapse_input` alone therefore renders no input — confirm
 * whether that is intended.
 *
 * @memberof module:convert
 * @param {string} source - The source code of a notebook cell.
 * @param {string[]} flags - An array of flags affecting the processing.
 * @param {Object} meta - Metadata associated with the notebook.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @returns {string} The processed source code; always a string ("" when nothing is rendered).
 */
function processSource(source, flags, meta, verbose = false) {
  if (flags[flags.length - 1] === '#export') { pyCode.push(source); }

  const skipList = ["#hide", "#hide_input", "%%javascript", "%%html", "%%capture"];
  if (flags.some((lbl) => skipList.includes(lbl))) { return ""; }

  let html = source;
  if (meta.prettify) { html = `<pre class='prettyprint'>${html}</pre>`; }

  // Previously this path fell off the end and returned undefined, contradicting the documented
  // string return type. Joining the converted cells treats both the same, so output is unchanged.
  if (!flags.includes('#collapse_input_open')) { return ""; }

  verbose && console.log(flags);
  for (const lbl of flags) {
    // Strip the flag line (either line-ending style) before wrapping.
    html = html.replaceAll(lbl + "\r\n", "");
    html = html.replaceAll(lbl + "\n", "");
    if (lbl == "#collapse_input_open") html = makeDetails(html, true, 'input');
    else if (lbl == "#collapse_input") html = makeDetails(html, false, 'input');
  }
  return html;
}
/**
 * Processes the output of a code cell, applying transformations based on flags and output type.
 * Strip Flags from output, make details, hide all.
 *
 * - `error` outputs and `stderr` streams produce "".
 * - Other streams are rendered through the text/html path using their `text`.
 * - Otherwise the first match of text/html, application/javascript, an `image/*` key,
 *   then text/plain is rendered; anything else (or a missing `data` bundle) produces "".
 * - When extraction applies and `process` is defined, the payload is queued on the module-level
 *   `assetsToWrite` and the markup refers to an `ASSET_PLACEHOLDER_<name>` instead.
 *
 * The output object passed in is not modified.
 *
 * Debug information is logged whenever `verbose` or `extractAssets` is truthy.
 *
 * NOTE(review): extracted HTML/JS asset names are derived from the first 8 characters of the
 * content, so two outputs with the same opening (e.g. "<!DOCTYPE") get the same name — confirm
 * how the asset writer deals with that.
 *
 * @function processOutput
 * @memberof module:convert
 * @param {Object} source - The output of a code cell.
 * @param {string[]} flags - An array of flags affecting the processing.
 * @param {boolean} [verbose=false] - If set to true, enables verbose logging for detailed information.
 * @param {string[]|boolean} [extractAssets=false] - Array of asset types to extract (e.g., ['png', 'js', 'txt', 'html']) or boolean for backward compatibility.
 * @param {string} [notebookName=null] - The name of the notebook for asset naming.
 * @returns {string} The processed output content.
 */
function processOutput(source, flags, verbose = false, extractAssets = false, notebookName = null) {
  if (source["output_type"] == "error") {
    return "";
  }
  // Outputs without a data bundle are treated as having no renderable content.
  let data = source["data"] ?? {};
  if (source["output_type"] == "stream") {
    if (source["name"] == "stderr") {
      return "";
    }
    // Route stream text through the text/html path with a local bundle, rather than
    // writing a `data` key onto the caller's output object.
    data = { "text/html": source["text"] };
  }
  const keys = Object.keys(data);
  const prefix = notebookName ? `${notebookName}-` : '';

  // Debug logging to see what's happening
  if (verbose || extractAssets) {
    console.log('processOutput debug:', {
      keys,
      data,
      hasTextHtml: keys.includes("text/html"),
      hasTextPlain: keys.includes("text/plain"),
      hasAppJs: keys.includes("application/javascript"),
      imageKeys: keys.filter(k => k.startsWith('image/'))
    });
  }

  // True when extraction is enabled for everything, or the requested list names this MIME type
  // (full type, its subtype, or the 'js' / 'html' shorthands).
  const shouldExtract = (type) => {
    const result = extractAssets === true || (Array.isArray(extractAssets) && extractAssets.some(t =>
      t.toLowerCase() === type || t.toLowerCase() === type.split('/')[1] ||
      (type === 'application/javascript' && t.toLowerCase() === 'js') ||
      (type === 'text/html' && t.toLowerCase() === 'html')));
    // Debug logging for shouldExtract
    if ((verbose || extractAssets) && (type === 'text/html' || type === 'application/javascript')) {
      console.log('shouldExtract debug:', {
        type,
        extractAssets,
        result,
        isArray: Array.isArray(extractAssets)
      });
    }
    return result;
  };

  let rendered;
  if (keys.includes("text/html")) {
    const html = data["text/html"];
    rendered = Array.isArray(html) ? html.join("") : html;
    // Calculate size in bytes
    const sizeInBytes = new TextEncoder().encode(rendered).length;
    const sizeThreshold = 100 * 1024; // 100KB
    // Only extract if it contains a doctype, starts with <html>, or is larger than 100KB
    const startsWithDoctype = rendered.toLowerCase().includes('<!doctype');
    const startsWithHtml = rendered.toLowerCase().trim().startsWith('<html');
    const isLargeEnough = sizeInBytes > sizeThreshold;
    if (shouldExtract('text/html') && typeof process !== "undefined" && (startsWithDoctype || startsWithHtml || isLargeEnough)) {
      const hash = rendered.substring(0, 8).replace(/[^a-zA-Z0-9]/g, '');
      const name = `${prefix}content-${hash}.html`;
      // Debug: Log what's being extracted as HTML
      if (verbose || extractAssets) {
        console.log('Extracting HTML asset:', { name, dataLength: rendered.length, sizeInBytes, startsWithDoctype, startsWithHtml, isLargeEnough, hash });
      }
      assetsToWrite.push({
        placeholderName: name,
        data: rendered,
        encoding: 'utf8',
        type: 'text/html',
        notebookPrefix: prefix
      });
      rendered = `<iframe src="ASSET_PLACEHOLDER_${name}" width="100%" height="400px"></iframe>`;
    }
  } else if (keys.includes("application/javascript")) {
    const js = data["application/javascript"];
    // Calculate size in bytes for JS content
    const jsContent = Array.isArray(js) ? js.join("") : js;
    const sizeInBytes = new TextEncoder().encode(jsContent).length;
    const sizeThreshold = 100 * 1024; // 100KB
    const isLargeEnough = sizeInBytes > sizeThreshold;
    if (shouldExtract('application/javascript') && typeof process !== "undefined" && isLargeEnough) {
      const hash = jsContent.toString().substring(0, 8).replace(/[^a-zA-Z0-9]/g, '');
      const name = `${prefix}script-${hash}.js`;
      // Debug: Log what's being extracted as JS
      if (verbose || extractAssets) {
        console.log('Extracting JS asset:', { name, sizeInBytes, isLargeEnough });
      }
      assetsToWrite.push({ placeholderName: name, data: jsContent, encoding: 'utf8', type: 'application/javascript' });
      rendered = `<script src="ASSET_PLACEHOLDER_${name}"></script>`;
    } else {
      rendered = "<script>" + jsContent + "</script>";
    }
  } else {
    // Check for images first, then fall back to text/plain if no image found
    const imageKey = keys.filter(key => key.startsWith('image/'))[0];
    if (imageKey && data[imageKey]) {
      const imageData = data[imageKey];
      // Debug logging for image processing
      if (verbose || extractAssets) {
        console.log('Image processing debug:', {
          imageKey,
          dataType: typeof imageData,
          dataLength: imageData?.length,
          shouldExtractResult: shouldExtract(imageKey),
          processEnv: typeof process !== "undefined"
        });
      }
      // Additional check to make sure this is actually image data
      if (typeof imageData === 'string' && imageData.length > 50) { // Basic sanity check for image data
        const imageType = imageKey.split('/')[1]; // Extract format (png, jpeg, gif, svg+xml, etc.)
        if (shouldExtract(imageKey) && typeof process !== "undefined") {
          // Use simple index-based naming instead of complex unique ID
          imageIndex++;
          // Handle special cases for file extensions
          let extension = imageType;
          if (imageType === 'jpeg') extension = 'jpg';
          if (imageType === 'svg+xml') extension = 'svg';
          const name = `${prefix}image-${imageIndex}.${extension}`;
          const encoding = imageType === 'svg+xml' ? 'utf8' : 'base64';
          // Debug: Log what's being extracted as image
          if (verbose || extractAssets) {
            console.log('Extracting image asset:', {
              name,
              imageType,
              extension,
              encoding,
              dataLength: imageData.length,
              imageIndex
            });
          }
          assetsToWrite.push({
            placeholderName: name,
            data: imageData,
            encoding: encoding,
            type: imageKey,
            notebookPrefix: prefix
          });
          rendered = `<img src="ASSET_PLACEHOLDER_${name}" alt="Image Alt Text">`;
        } else {
          if (verbose || extractAssets) {
            console.log('Image not extracted - inline instead:', {
              shouldExtract: shouldExtract(imageKey),
              processUndefined: typeof process === "undefined"
            });
          }
          rendered = `<img src="data:${imageKey};base64,${imageData}" alt="Image Alt Text">`;
        }
      } else {
        // If we reach here, there was an image key but no valid image data
        if (verbose || extractAssets) {
          console.log('Found image key but invalid data:', { imageKey, dataType: typeof imageData, dataLength: imageData?.length });
        }
        rendered = "";
      }
    } else if (keys.includes("text/plain")) {
      const text = data["text/plain"];
      // Always keep text/plain inline, don't extract to separate files.
      // Plain-text reprs containing "<Figure" are dropped.
      rendered = !/<Figure/.test(text) ? (Array.isArray(text) ? text.join('') : text) : "";
    } else {
      // No recognized content type found
      if (verbose || extractAssets) {
        console.log('No recognized content type found:', { keys, availableData: Object.keys(data) });
      }
      rendered = "";
    }
  }

  for (const lbl of flags) {
    try {
      // Strip the flag line (either line-ending style) from the rendered output.
      rendered = rendered.replaceAll(lbl + "\r\n", "");
      rendered = rendered.replaceAll(lbl + "\n", "");
    } catch {
      // Best effort: a non-string payload is left as-is and only reported when verbose.
      verbose && console.log("ERROR: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!processOutput... ", typeof rendered, rendered);
    }
    if (lbl == "#collapse_output_open") {
      rendered = makeDetails(rendered, true, 'output');
    }
    if (lbl == "#collapse_output") {
      rendered = makeDetails(rendered, false, 'output');
    }
    if (lbl == "#hide_output") {
      rendered = "";
    }
    if (lbl == "#hide") {
      rendered = "";
    }
  }
  return rendered;
  //output_type == 'stream' ==> text
  //output_type == 'display_data' ==> data{'application/javascript' or 'text/html' or 'execute_result'}
}
export { nb2json, get_metadata, convertNb }