stedolan-jq/docs/build_manpage.py

#!/usr/bin/env python3
from datetime import date
from io import StringIO
from lxml import etree
import markdown
from markdown.extensions import Extension
import re
import sys
import yaml

# Prevent our markdown parser from trying to help by interpreting things in angle brackets as HTML tags.
class EscapeHtml(Extension):
  """Markdown extension that switches off raw-HTML handling.

  It removes the 'html_block' preprocessor and the 'html' inline pattern so
  that text written in angle brackets is passed through as ordinary text
  instead of being treated as an HTML tag.
  """

  def extendMarkdown(self, md, md_globals=None):
    # md_globals is optional: old Python-Markdown releases pass it, newer
    # ones call extendMarkdown(md) with a single argument.
    for registry, name in ((md.preprocessors, 'html_block'),
                           (md.inlinePatterns, 'html')):
      if hasattr(registry, 'deregister'):
        # Registry API of newer Python-Markdown releases.
        registry.deregister(name)
      else:
        # Older releases expose a dict-like container instead.
        del registry[name]

class RoffWalker(object):
  """Render a parsed HTML tree as roff (man page) source.

  The tree nodes are expected to offer the lxml element interface used
  below (tag, text, tail, getnext(), itertext(), len() and indexing).
  Output is staged in an in-memory buffer and only written to the
  ``output`` stream by walk().
  """

  def __init__(self, tree, output=sys.stdout):
    """Remember the tree to render and the stream that receives the result."""
    self.tree = tree
    self.target = output
    # Everything is buffered here so walk() can post-process it in one go.
    self.f = StringIO()

  def walk(self):
    """Render the whole tree and write the finished roff text to the target."""
    self._walk(self.tree, parent_tag=None)
    # We don't want to start lines with \. because that can confuse man
    # For lines that start with \., we need to prefix them with \& so it
    # knows not to treat that line as a directive
    data = re.sub(r'^\\\.', r'\&.', self.f.getvalue(), flags=re.MULTILINE)
    self.target.write(data)

  def _ul_is_special(self, root):
    """Return True for a list with exactly one <li> whose text ends in ':'.

    _walk() renders such a list as a tagged paragraph (.TP) instead of a
    bullet list.
    """
    if len(root) != 1:
      return False
    child = root[0]
    if child.tag != 'li':
      return False
    msg = ''.join(child.itertext()).strip()
    return msg.endswith(':')

  def _walk_child(self, root):
    """Walk the children of ``root``, if it has any.

    Only the first child is passed on; _walk() itself advances through the
    following siblings.
    """
    if len(root) > 0:
      self._walk(root[0], parent_tag=root.tag)

  def _write_element(self, root, ensure_newline=True):
    """Write an element's leading text, then its children, then its tail."""
    if root.text is not None:
      text = self._sanitize(root.text)
      self.__write_raw(text)
    self._walk_child(root)
    self._write_tail(root, ensure_newline=ensure_newline)

  def _write_tail(self, root, ensure_newline=False, inline=False):
    """Write the text that follows ``root``'s closing tag.

    A tail that is exactly one newline is skipped unless ``inline`` is set.
    With ``ensure_newline`` a newline is appended afterwards.
    """
    if root.tail is not None:
      if inline or root.tail != '\n':
        text = self._sanitize(root.tail)
        # _sanitize() currently turns every newline into a space, so this
        # guard does not fire; it is kept as a safeguard against emitting a
        # doubled newline.
        if text.endswith('\n'):
          ensure_newline = False
        self.__write_raw(text)
    if ensure_newline:
      self.__write_raw('\n')

  def _walk(self, root, parent_tag=None):
    """Render ``root`` and all of its following siblings.

    ``parent_tag`` is the tag of the enclosing element (None at the top).
    Some branches consume following siblings themselves and move ``root``
    forward so that those siblings are not rendered twice.
    """
    last_tag = None
    while root is not None:
      if root.tag == 'h1':
        # The heading's own text is ignored; a fixed title block is written.
        self.__write_cmd('.TH "JQ" "1" "{}" "" ""'.format(date.today().strftime('%B %Y')))
        self.__write_cmd('.SH "NAME"')
        # TODO: properly parse this
        self.__write_raw(r'\fBjq\fR \- Command\-line JSON processor' + "\n")

      elif root.tag == 'h2':
        self.__write_cmd('.SH "{}"'.format(''.join(root.itertext()).strip()))

      elif root.tag == 'h3':
        text = ''.join(root.itertext()).strip()
        self.__write_cmd('.SS "{}"'.format(self._h3_sanitize(text)))

      elif root.tag == 'p':
        # No paragraph break directly after a heading or inside a list item.
        if last_tag not in ['h2', 'h3'] and parent_tag not in ['li']:
          self.__write_cmd('.P')
        self._write_element(root, ensure_newline=(parent_tag != 'li'))

      elif root.tag == 'ul':
        if self._ul_is_special(root):
          li = root[0]
          self.__write_cmd('.TP')
          self._write_element(li)
          # Paragraphs that follow the list are rendered as its indented body.
          sibling = root.getnext()
          while sibling is not None and sibling.tag == 'p':
            if sibling.getnext() is not None and sibling.getnext().tag == 'pre':
              # we don't want to .IP these, because it'll look funny with the code indent
              break
            self.__write_cmd('.IP')
            self._write_element(sibling)
            # Mark the paragraph as consumed so the outer loop skips it.
            root = sibling
            sibling = root.getnext()
        else:
          self._walk_child(root)
          self._write_tail(root)
          # A pre tag after the end of a list doesn't want two of the indentation commands
          if root.getnext() is None or root.getnext().tag != 'pre':
            self.__write_cmd('.IP "" 0')

      elif root.tag == 'li':
        self.__write_cmd(r'.IP "\(bu" 4')
        if root.text is not None and root.text.strip() != '':
          text = self._sanitize(root.text)
          self.__write_raw(text)
        self._walk_child(root)
        self._write_tail(root, ensure_newline=True)

      elif root.tag == 'strong':
        if root.text is not None:
          text = self._sanitize(root.text)
          self.__write_raw('\\fB{}\\fR'.format(text))

        self._write_tail(root, inline=True)

      elif root.tag == 'em':
        if root.text is not None:
          text = self._sanitize(root.text)
          self.__write_raw('\\fI{}\\fR'.format(text))
        self._write_tail(root, inline=True)

      elif root.tag == 'code':
        if root.text is not None:
          text = self._code_sanitize(root.text)
          self.__write_raw('\\fB{}\\fR'.format(text))
        self._write_tail(root, inline=True)

      elif root.tag == 'pre':
        self.__write_cmd('.IP "" 4')
        self.__write_cmd('.nf\n') # extra newline for spacing reasons
        # Consecutive <pre> siblings are merged into a single no-fill region,
        # separated by one blank line each.
        sibling = root
        first = True
        while sibling is not None and sibling.tag == 'pre':
          if not first:
            self.__write_raw('\n')
          text = ''.join(sibling.itertext(with_tail=False))
          self.__write_raw(self._pre_sanitize(text))
          first = False
          root = sibling
          sibling = sibling.getnext()
        self.__write_cmd('.fi')
        self.__write_cmd('.IP "" 0')

      else:
        # Any other tag produces no output of its own; just descend into it.
        self._walk_child(root)

      last_tag = root.tag
      root = root.getnext()

  def _base_sanitize(self, text):
    """Escape backslash, period, apostrophe and hyphen for roff."""
    # Backslashes must be handled first so that the backslashes inserted by
    # the later replacements are not escaped a second time.
    text = text.replace('\\', '\\e')
    text = text.replace('.', '\\.')
    text = text.replace("'", "\\'")
    text = text.replace('-', '\\-')
    return text

  def _pre_sanitize(self, text):
    """Escape preformatted text; whitespace and newlines are left untouched."""
    return self._base_sanitize(text)

  def _code_sanitize(self, text):
    """Escape inline code and turn every whitespace character into a space."""
    text = self._base_sanitize(text)
    text = re.sub(r'\s', ' ', text)
    return text

  def _h3_sanitize(self, text):
    """Escape a sub-heading and join its lines into one line."""
    text = self._base_sanitize(text)
    # A newline next to a space collapses into that single space ...
    text = re.sub(' \n|\n ', ' ', text)
    # ... and any remaining newline becomes a space.
    text = re.sub('\n', ' ', text)
    return text

  def _sanitize(self, text):
    """Escape running text.

    Besides the basic escaping, <placeholder> spans are set in italics, runs
    of spaces are collapsed to one space and newlines become spaces.
    """
    text = self._base_sanitize(text)
    text = re.sub(r'<([^>]+)>', r'\\fI\1\\fR', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub('\n', ' ', text)
    return text

  def __write_cmd(self, dat):
    """Buffer a roff request, preceded by a line holding a lone '.'."""
    self.f.write('.\n{}\n'.format(dat))

  def __write_raw(self, dat):
    """Buffer ``dat`` verbatim, without adding a newline."""
    self.f.write('{}'.format(dat))

def load_yml_file(fn):
  """Parse the YAML file at path ``fn`` and return the loaded data.

  The file is opened explicitly as UTF-8 so that the result does not depend
  on the locale's default encoding.
  """
  with open(fn, encoding='utf-8') as f:
    return yaml.safe_load(f)

def dedent_body(body):
  """Strip a two-space indent from the lines of ``body``.

  Only lines that start with exactly two spaces followed by a
  non-whitespace character are changed; deeper-indented and blank lines
  are returned untouched.
  """
  strip_indent = re.compile(r'^  (\S)').sub
  return '\n'.join(strip_indent(r'\1', line) for line in body.split('\n'))

def convert_manual_to_markdown(manual_path="content/manual/manual.yml"):
  """Flatten the YAML manual into one Markdown document and return it.

  ``manual_path`` is the YAML file to read; it defaults to the path this
  script has always used, so existing calls without arguments still work.

  Layout of the result: the manpage intro and top-level body, then each
  section as a '##' heading (title upper-cased), each entry as a '###'
  heading, the entry's examples inside a ~~~~ fenced block, and finally
  the manpage epilogue.
  """
  manual = load_yml_file(manual_path)
  f = StringIO()
  f.write(manual.get('manpage_intro', '\n'))
  f.write(dedent_body(manual.get('body', '\n')))
  for section in manual.get('sections', []):
    f.write('## {}\n'.format(section.get('title', '').upper()))
    f.write(dedent_body(section.get('body', '\n')))
    f.write('\n')
    for entry in section.get('entries', []):
      f.write('### {}\n'.format(entry.get('title', '')))
      f.write(dedent_body(entry.get('body', '\n')))
      f.write('\n')
      examples = entry.get('examples')
      if examples is not None:
        f.write("~~~~\n")
        for index, example in enumerate(examples):
          # One blank line between consecutive examples, none before the first.
          if index:
            f.write('\n')
          f.write("jq '{}'\n".format(example.get('program', '')))
          f.write("   {}\n".format(example.get('input', '')))
          output = [str(x) for x in example.get('output', [])]
          f.write("=> {}\n".format(', '.join(output)))
        f.write("~~~~\n")
    f.write('\n')
  f.write(manual.get('manpage_epilogue', ''))
  return f.getvalue()

# Step 1: flatten the YAML manual into a single Markdown document.
markdown_data = convert_manual_to_markdown()

# Step 2: render the Markdown as HTML. EscapeHtml is installed as an
# extension, and 'fenced_code' is enabled because the generated Markdown
# wraps examples in ~~~~ fences.
html_data = markdown.markdown(markdown_data, extensions=[EscapeHtml(), 'fenced_code'])

# Step 3: parse the HTML into an element tree so we can walk it.
tr = etree.HTML(html_data, etree.HTMLParser())

# Step 4: walk the HTML tree and write the roff output to stdout
# (RoffWalker's default output stream).
RoffWalker(tr).walk()