Source: smart-chunks/SmartMarkdown.js

/**
 * SmartMarkdown splits markdown content into heading-delimited blocks and can
 * retrieve a single block again from a heading path such as
 * `file.md#Heading#Sub heading`.
 *
 * Behavior is driven by a configuration object (see {@link SmartMarkdown.defaults})
 * that controls which headings are excluded and the minimum/maximum block sizes.
 */
class SmartMarkdown {
  /**
   * Returns the default configuration for the SmartMarkdown parser.
   * A fresh object is returned on every access, so callers may mutate it safely.
   * @returns {Object} Default configuration settings.
   */
  static get defaults() {
    return {
      excluded_headings: null, // comma separated list of headings to exclude
      embed_input_max_chars: 1000, // max length of block
      embed_input_min_chars: 10, // min length of block
      skip_blocks_with_headings_only: false, // skip blocks that only contain headings
      multi_heading_blocks: false, // when true, deeper sub-headings may stay inside the parent block (see parse)
    };
  }
  /**
   * Creates an instance of SmartMarkdown with the given configuration.
   * @param {Object} [config] - User-defined settings; they override the defaults key by key.
   */
  constructor(config) {
    this.config = { ...SmartMarkdown.defaults, ...config };
  }
  /**
   * Retrieves the list of headings to be excluded from parsing, if any.
   * Accepts either a comma separated string or an array in `config.excluded_headings`.
   * Entries are trimmed and empty entries are dropped: an empty exclusion would
   * substring-match every heading and silently exclude all blocks.
   * @returns {Array<string>|null} An array of headings to exclude, or null if none.
   */
  get excluded_headings() {
    const raw = this.config.excluded_headings;
    if (!raw?.length) return null;
    const exclusions = (Array.isArray(raw) ? raw : String(raw).split(","))
      .map((header) => String(header).trim())
      .filter((header) => header.length > 0);
    return exclusions.length > 0 ? exclusions : null;
  }
  /**
   * Analyzes the markdown content to extract metadata about each heading.
   * Work in progress: despite its name, `chars_until_next_heading` currently holds
   * the number of lines between this heading and the next heading line
   * (0 when the next line is a heading, -1 when no heading follows).
   * @param {string} content - The markdown content to analyze.
   * @returns {Array<Object>} One `{ line_i, heading_level, heading_text, chars_until_next_heading }` per heading.
   */
  get_headings_meta(content) {
    return content.split('\n').reduce((acc, line, line_i, lines) => {
      if (!this.is_heading(line)) return acc;
      const chars_until_next_heading = lines
        .slice(line_i + 1)
        .findIndex((next_line) => this.is_heading(next_line));
      const heading_level = this.heading_level(line);
      const heading_text = line.replace(/#/g, '').trim();
      acc.push({ line_i, heading_level, heading_text, chars_until_next_heading });
      return acc;
    }, []);
  }

  /**
   * Extracts a specific block of markdown based on a heading path.
   *
   * The path is `<file path>#<heading>#<sub heading>...`. The last heading may
   * carry a `{n}` suffix selecting the n-th repeated occurrence (0 = first), which
   * mirrors the suffix produced by `handle_duplicate_headings`.
   *
   * - A path with no `#` returns the markdown unchanged.
   * - A path that is just `<file path>#` returns everything before the first heading line.
   * - A heading that cannot be found returns an empty string.
   *
   * @param {string} block_path - The path to the block, specified as a series of headings.
   * @param {string} markdown - The markdown content to parse.
   * @param {Object} [opts] - Options for block extraction.
   * @param {number|null} [opts.chars_per_line=null] - Truncate longer lines to this many characters (adds "...").
   * @param {number} [opts.max_chars=config.embed_input_max_chars] - Character budget for the block (newlines not counted).
   * @returns {string} The extracted block of markdown text.
   */
  get_block_from_path(block_path, markdown, opts={}){
    const lines = markdown.split('\n');
    // path is "<file>#": return the content that precedes the first heading line
    // (an inline "#tag" is not a heading and must not cut the content short)
    if (block_path.endsWith('#') && block_path.split('#').length === 2) {
      const first_heading_i = lines.findIndex((line) => this.is_heading(line));
      if (first_heading_i < 0) return markdown;
      if (first_heading_i === 0) return '';
      return lines.slice(0, first_heading_i).join('\n') + '\n';
    }
    if (!this.validate_block_path(block_path)) return markdown;
    const {
      chars_per_line = null,
      max_chars = this.config.embed_input_max_chars,
    } = opts;
    const block_headings = block_path.split("#").slice(1);
    // a trailing "{n}" on the last heading selects the n-th duplicate; only a
    // purely numeric suffix counts so headings that merely contain "{" still resolve
    let heading_occurrence = 0;
    const last_i = block_headings.length - 1;
    const occurrence_match = /^(.*)\{(\d+)\}$/.exec(block_headings[last_i]);
    if (occurrence_match) {
      block_headings[last_i] = occurrence_match[1];
      heading_occurrence = Number.parseInt(occurrence_match[2], 10);
    }
    // FIND HEADING
    const current_headers = [];
    let begin_line = 0;
    let block_heading_level = 0;
    let occurrence_count = 0;
    let is_code = false;
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      if (line.startsWith('```')) is_code = !is_code; // fence line toggles code mode
      if (is_code) continue; // headings inside fenced code are ignored while searching
      if (!this.is_heading(line)) continue;
      const heading_text = line.replace(/#/g, '').trim();
      // the heading must be the next expected entry of the path; comparing by
      // position (not indexOf) keeps paths that repeat a heading text working
      if (block_headings[current_headers.length] !== heading_text) continue;
      current_headers.push(heading_text);
      if (current_headers.length < block_headings.length) continue; // path only partially matched so far
      if (occurrence_count === heading_occurrence) {
        block_heading_level = this.heading_level(line);
        begin_line = i + 1;
        break;
      }
      occurrence_count++; // full match, but not the requested occurrence yet
      current_headers.pop(); // wait for the next heading with the same text
    }
    // BUILD BLOCK
    if (begin_line === 0) return ''; // begin_line is always >= 1 when the heading was found
    const block = [];
    let char_count = 0;
    is_code = false;
    for (let i = begin_line; i < lines.length; i++) {
      let line = lines[i];
      // stop at the next heading of the same or a shallower level
      if (this.is_heading(line) && (this.heading_level(line) <= block_heading_level)) break;
      if (chars_per_line && (line.length > chars_per_line)) line = line.slice(0, chars_per_line) + "..."; // limit length of line
      if (line.startsWith("```")) is_code = !is_code; // track open fences so they can be closed below
      char_count += line.length;
      if (max_chars && (char_count > max_chars)) {
        // cut the overflowing line so the block ends exactly at the budget
        block.push(line.slice(0, line.length - (char_count - max_chars)) + "...");
        break;
      }
      block.push(line);
      if (max_chars && (max_chars - char_count < 10)) break; // fewer than 10 chars of budget left: stop here
    }
    if (is_code) block.push("```"); // close code block if the block was cut while a fence was open
    return block.join("\n").trim();
  }
  /**
   * Parses the markdown content and organizes it into structured blocks based on headings.
   *
   * Each block's `text` starts with a breadcrumb line (`<file crumbs>: <heading> > <sub heading>:`)
   * followed by the block's lines. Files ending in `.excalidraw.md` yield a single
   * block holding the content of their "Text Elements" heading.
   *
   * With `config.multi_heading_blocks` enabled, a heading deeper than the current
   * block's heading is kept inside that block until the block grows past
   * `embed_input_max_chars`; otherwise every heading starts a new block.
   *
   * @param {Object} params - Parameters containing content and optional file path.
   * @param {string} params.content - The markdown content to parse.
   * @param {string} [params.file_path=''] - Path of the file, used for breadcrumbs and block paths.
   * @returns {Object} `{ blocks, log, file_path, ... }` where each block is
   *   `{ text, path, length, heading, lines }`. Leftover traversal state is also
   *   present on the result; reduce-only keys are explicitly set to `undefined`.
   */
  parse({ content, file_path='' }) {
    const file_breadcrumbs = this.file_path_to_breadcrumbs(file_path);
    // if is excalidraw file, block for 'Text Elements' heading only
    if (file_path.endsWith('.excalidraw.md')) {
      const excalidraw_path = file_path + "#Text Elements";
      const excalidraw_block = this.get_block_from_path(excalidraw_path, content).replace('\n%%', '');
      return {
        blocks: [
          {
            text: excalidraw_block,
            path: excalidraw_path,
            length: excalidraw_block.length,
            heading: "Text Elements",
          }
        ],
        log: [],
      };
    }

    const initial_state = {
      block_headings: '',
      block_headings_list: [],
      block_path: file_path + "#", // content before the first heading lives at "<file>#"
      curr: file_breadcrumbs,
      current_headers: [],
      blocks: [],
      log: [],
      start_line: 0,
      curr_line: 0,
      curr_heading: null,
    };
    const output = content.split('\n') // split the markdown into lines
      .reduce((acc, line, i, arr) => {
        const starts_new_block = this.is_heading(line) && (
          !acc.curr_level
          || !this.config.multi_heading_blocks
          || (this.heading_level(line) <= acc.curr_level)
          || (acc.curr.length > this.config.embed_input_max_chars)
        );
        if (starts_new_block) {
          this.output_block(acc); // flush the block collected so far
          const heading_text = line.replace(/#/g, '').trim();
          acc.curr_level = this.heading_level(line);
          // keep only ancestors (strictly shallower headings) of the new heading
          acc.current_headers = acc.current_headers.filter((header) => header.level < acc.curr_level);
          acc.current_headers.push({ header: heading_text, level: acc.curr_level });
          acc.start_line = i;
          acc.curr = file_breadcrumbs; // restart the block text with the file breadcrumbs
          if (acc.current_headers.length > 0) acc.curr += ": " + acc.current_headers.map((header) => header.header).join(' > ');
          acc.block_headings = "#" + acc.current_headers.map((header) => header.header).join('#');
          this.handle_duplicate_headings(acc);
          acc.block_headings_list.push(acc.block_headings);
          acc.block_path = file_path + acc.block_headings;
          acc.curr_heading = heading_text;
          return acc;
        }
        // not the start of a new block: add line to current block
        if (this.is_content_line(line)) {
          if (acc.curr.indexOf("\n") === -1) acc.curr += ":"; // add ":" to indicate end of heading breadcrumbs
          acc.curr += "\n" + line;
          acc.curr_line = i;
        }
        if (i === arr.length - 1) this.output_block(acc); // if last line, output the block
        return acc;
      }, initial_state)
    ;
    return {
      ...output,
      file_path,
      // remove properties that are exclusive to the reduce function
      block_headings: undefined,
      block_headings_list: undefined,
      block_path: undefined,
      curr: undefined,
      current_headers: undefined,
    };
  }
  /**
   * Handles duplicate headings by appending a unique identifier to the heading path.
   * If `acc.block_headings` is already in `acc.block_headings_list`, it is rewritten to
   * `<headings>{n}` using the smallest n >= 1 that is not taken yet.
   * @param {Object} acc - The accumulator object used in reduce function (mutated in place).
   */
  handle_duplicate_headings(acc) {
    if (!acc.block_headings_list.includes(acc.block_headings)) return; // path not used yet
    const used_headings = new Set(acc.block_headings_list);
    let count = 1;
    while (used_headings.has(`${acc.block_headings}{${count}}`)) { count++; }
    acc.block_headings = `${acc.block_headings}{${count}}`;
  }
  /**
   * Outputs the current block into the structured blocks array after validation.
   * A block is skipped (with a message pushed to `acc.log`) when it has no content,
   * its heading path is excluded, its body is shorter than `embed_input_min_chars`,
   * or — with `skip_blocks_with_headings_only` — every body line is a heading.
   * Text longer than `embed_input_max_chars` is truncated first.
   * @param {Object} acc - The accumulator object used in reduce function (mutated in place).
   * @returns {number|undefined} New log length when the block is skipped, otherwise undefined; not meaningful to callers.
   */
  output_block(acc) {
    const { embed_input_max_chars, embed_input_min_chars } = this.config;
    if (acc.curr.indexOf("\n") === -1) return acc.log.push(`Skipping empty block: ${acc.curr}`); // indicated by no newlines in block
    if (!this.validate_heading(acc.block_headings)) return acc.log.push(`Skipping excluded heading: ${acc.block_headings}`);
    if (acc.curr.length > embed_input_max_chars) acc.curr = acc.curr.substring(0, embed_input_max_chars); // trim block to max length
    const text = acc.curr.replace(/\r\n/g, '\n').trim();
    const block_lines = text.split('\n');
    const body_lines = block_lines.slice(1); // first line is the breadcrumbs
    const block_length = body_lines.join('\n').trim().length;
    if (block_length < embed_input_min_chars) return acc.log.push(`Skipping block shorter than min length: ${acc.curr}`);
    if (this.config.skip_blocks_with_headings_only) {
      const heading_lines = body_lines.filter((line) => this.is_heading(line));
      if (heading_lines.length === body_lines.length) return acc.log.push(`Skipping block with only headings: ${acc.curr}`);
    }
    acc.blocks.push({
      text,
      path: acc.block_path,
      length: block_length,
      heading: acc.curr_heading,
      lines: [acc.start_line, acc.curr_line],
    });
  }
  /**
   * Determines if a line of text should be considered as content.
   * Only an empty bullet (`- `) or an empty checkbox (`- [ ] `) is rejected.
   * @param {string} line - The line of text to evaluate.
   * @returns {boolean} True if the line is content, false otherwise.
   */
  is_content_line(line) {
    return !['- ', '- [ ] '].includes(line);
  }
  /**
   * Converts a file path to a breadcrumb string format.
   * Removes the first ".md" occurrence and joins the trimmed, non-empty path
   * segments with " > ".
   * @param {string} file_path - The file path to convert.
   * @returns {string} The breadcrumb string.
   */
  file_path_to_breadcrumbs(file_path) {
    return file_path
      .replace('.md', '')
      .split('/')
      .map((crumb) => crumb.trim())
      .filter((crumb) => crumb !== '')
      .join(' > ');
  }
  /**
   * Determines the level of a heading based on the number of leading '#' characters.
   * Only the leading run is counted, so '#' characters inside the heading text
   * (e.g. "## C# notes") do not inflate the level.
   * @param {string} line - The heading line to evaluate.
   * @returns {number} The level of the heading (0 if the line does not start with '#').
   */
  heading_level(line) {
    const leading_hashes = /^#+/.exec(line);
    return leading_hashes ? leading_hashes[0].length : 0;
  }
  /**
   * Checks if a line is a heading: it starts with '#' and the second character
   * is a space or another '#' (so "#tag" is not treated as a heading).
   * @param {string} line - The line to check.
   * @returns {boolean} True if the line is a heading, false otherwise.
   */
  is_heading(line) {
    return line.startsWith('#') && ['#', ' '].includes(line[1]);
  }
  /**
   * Validates if the block path is correctly formatted to include at least one heading.
   * @param {string} block_path - The block path to validate.
   * @returns {boolean} True if the block path contains at least one "#", false otherwise.
   */
  validate_block_path(block_path) {
    return block_path.includes("#");
  }
  /**
   * Validates a heading path against the list of excluded headings.
   * An exclusion matches when it appears anywhere in the heading path (substring match).
   * @param {string} headings - The heading path to validate, e.g. "#Heading#Sub heading".
   * @returns {boolean} True if the heading is not excluded, false if it is.
   */
  validate_heading(headings) {
    return !this.excluded_headings?.some((exclusion) => headings.includes(exclusion));
  }

}
// CommonJS named export: exposes the class as the `SmartMarkdown` property of this module's exports.
exports.SmartMarkdown = SmartMarkdown;