Source code for cmakelang.markup

# -*- coding: utf-8 -*-
"""
Functions for parsing and formatting markup found in comments
"""
from __future__ import unicode_literals
import math
import textwrap
import re
from cmakelang import common

# Matches comment lines that open with an upper-case tag followed by a
# parenthesized annotation and a colon, like ``# TODO(josh):`` or
# ``# NOTE(josh):``
NOTE_REGEX = re.compile(r'^\s*[A-Z_]+\([^)]+\):.*')

# Matches comment lines that are clearly meant to separate sections or
# headers. The meaning of this regex is "after optional leading whitespace,
# a line that starts with three characters which are neither word characters
# nor whitespace, and that ends with three such characters"
RULER_PATTERN = r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'
RULER_REGEX = re.compile(RULER_PATTERN)

# Matches lines that start a bulleted list. Groups: (1) leading whitespace,
# (2) the bullet character (``*`` or ``-``), (3) the item text including the
# space that separates it from the bullet
BULLET_REGEX = re.compile(r'^(\s*)([\*-])( .+)$')

# Matches lines that start an enumerated (numbered) list. Groups: (1) leading
# whitespace, (2) the punctuation following the number (``.`` or ``:``),
# (3) the item text including the space that separates it from the number
ENUM_REGEX = re.compile(r'^(\s*)\d+([.:])( .+)$')

# Matches a verbatim fence: optional leading whitespace followed by three or
# more backtick/tilde characters. Groups: (1) the fence characters, (2) the
# remainder of the line
FENCE_PATTERN = r'^\s*([`~]{3}[`~]*)(.*)$'
FENCE_REGEX = re.compile(FENCE_PATTERN)


class CommentType(common.EnumObject):
    """
    Enumeration of the kinds of markup entity found in a comment block.
    """
    # Own registry for this enumeration, separate from the base class one.
    _id_map = {}


# Members are attached after the class statement because each one is an
# instance of the class itself.
CommentType.PARAGRAPH = CommentType(0)    # free-flowing text
CommentType.BULLET_LIST = CommentType(1)  # list introduced by ``*`` or ``-``
CommentType.ENUM_LIST = CommentType(2)    # list introduced by a number
CommentType.NOTE = CommentType(3)         # e.g. ``TODO(josh): ...``
CommentType.RULER = CommentType(4)        # section separator line
CommentType.SEPARATOR = CommentType(5)    # empty line
CommentType.FENCE = CommentType(6)        # opens or closes a verbatim block
CommentType.VERBATIM = CommentType(7)     # text between two fences
class CommentItem(object):
    """
    One markup entity of a comment block, together with its text lines.
    """

    def __init__(self, kind):
        # Text lines belonging to this entity; a fresh list per instance.
        self.lines = []
        # Indentation width; left as None until a producer assigns it.
        self.indent = None
        # The kind of entity; only its ``name`` attribute is read here.
        self.kind = kind

    def __repr__(self):
        kind_name = self.kind.name
        return "{}".format(kind_name)
def parse(lines, config=None):
    """
    Parse comment lines into a list of formattable markup items.

    Args:
        lines: iterable of strings, one per line of comment text.
        config: optional configuration object. When given,
            ``config.markup.fence_pattern`` and
            ``config.markup.ruler_pattern`` are compiled and used in place
            of the module-level ``FENCE_REGEX`` / ``RULER_REGEX``.

    Returns:
        A list of ``CommentItem`` objects in input order. List items carry
        the width of their leading whitespace in ``indent``.
    """
    # pylint: disable=too-many-statements,too-many-branches
    if config is None:
        fence_re = FENCE_REGEX
        ruler_re = RULER_REGEX
    else:
        fence_re = re.compile(config.markup.fence_pattern)
        ruler_re = re.compile(config.markup.ruler_pattern)

    obj_list = []
    state = None
    # Recognizes further items of the list that is currently open.
    bullet_regex = None

    for line in lines:
        fence_match = fence_re.match(line)
        if fence_match:
            obj_list.append(CommentItem(CommentType.FENCE))
            obj_list[-1].lines.append(fence_match.group(1).strip())
            # Whatever follows the fence on the same line is processed as
            # if it were a line of its own.
            line = fence_match.group(2).strip()
            if state == CommentType.VERBATIM:
                # Closing fence
                state = None
            else:
                # Opening fence: subsequent lines are collected verbatim
                obj_list.append(CommentItem(CommentType.VERBATIM))
                state = CommentType.VERBATIM
            if not line:
                continue

        if state == CommentType.VERBATIM:
            # Drop at most one leading space; keep everything else as is.
            if line and line[0] == ' ':
                obj_list[-1].lines.append(line[1:])
            else:
                obj_list[-1].lines.append(line)
            continue

        if not line:
            # Consecutive empty lines collapse into a single separator.
            if state is CommentType.SEPARATOR:
                continue
            obj_list.append(CommentItem(CommentType.SEPARATOR))
            state = CommentType.SEPARATOR
            continue

        # Rulers are recognized in every (non-verbatim) state, always with
        # the configured pattern. No later branch needs to test for rulers
        # again.
        if ruler_re.match(line):
            obj_list.append(CommentItem(CommentType.RULER))
            obj_list[-1].lines.append(line.strip())
            state = CommentType.RULER
            continue

        if state in (None, CommentType.SEPARATOR, CommentType.RULER):
            match = BULLET_REGEX.match(line)
            if match:
                obj_list.append(CommentItem(CommentType.BULLET_LIST))
                indent_str = match.group(1)
                bullet_punctuation = match.group(2)
                obj_list[-1].lines.append(match.group(3))
                obj_list[-1].indent = len(indent_str)
                state = CommentType.BULLET_LIST
                # Further items must use the same indentation and bullet.
                bullet_regex = re.compile(
                    '^{}{}( .*)$'.format(
                        re.escape(indent_str),
                        re.escape(bullet_punctuation)))
                continue

            match = ENUM_REGEX.match(line)
            if match:
                obj_list.append(CommentItem(CommentType.ENUM_LIST))
                indent_str = match.group(1)
                bullet_punctuation = match.group(2)
                obj_list[-1].lines.append(match.group(3).strip())
                obj_list[-1].indent = len(indent_str)
                state = CommentType.ENUM_LIST
                # TODO(josh) We want to match lines with either the same
                # number of spaces or with the colon in the same column
                #
                # The punctuation must be escaped: an unescaped ``.`` is a
                # regex wildcard, which would turn a continuation line such
                # as ``12 apples`` into a new item and drop its ``12``.
                bullet_regex = re.compile(
                    r'^{}\d+{}( .*)$'.format(
                        re.escape(indent_str),
                        re.escape(bullet_punctuation)))
                continue

            if NOTE_REGEX.match(line):
                obj_list.append(CommentItem(CommentType.NOTE))
                obj_list[-1].lines.append(line.strip())
                state = CommentType.NOTE
                continue

            state = CommentType.PARAGRAPH
            obj_list.append(CommentItem(CommentType.PARAGRAPH))
            obj_list[-1].lines.append(line.strip())
        elif state in (CommentType.PARAGRAPH, CommentType.NOTE):
            # A note line always opens a new item; any other line continues
            # the current paragraph or note.
            if NOTE_REGEX.match(line):
                obj_list.append(CommentItem(CommentType.NOTE))
                state = CommentType.NOTE
            obj_list[-1].lines.append(line.strip())
        elif state in (CommentType.BULLET_LIST, CommentType.ENUM_LIST):
            match = bullet_regex.match(line)
            if match:
                # Next item of the same list
                obj_list[-1].lines.append(match.group(1).strip())
            else:
                # Continuation of the current list item
                obj_list[-1].lines[-1] += '\n' + line.strip()

    return obj_list
def is_hashruler(item):
    """
    Tell whether a markup item is a ruler made only of hash characters,
    such as the first and last lines here::

        ###########################
        # Like this ^^^ or this vvv
        ###########################
    """
    if item.kind != CommentType.RULER or len(item.lines) != 1:
        return False
    # Nothing may remain once the hash characters are stripped from both
    # ends of the line.
    leftover = item.lines[0].strip('#')
    return not leftover
# Keyword arguments shared by every ``textwrap.TextWrapper`` built in this
# module.
COMMON_KWARGS = {
    'expand_tabs': True,
    'replace_whitespace': True,
    'drop_whitespace': True,
    'break_long_words': False,
    'break_on_hyphens': False,
}
def format_item(config, line_width, item):
    """
    Return lines of formatted text for one markup item, based on its kind.

    Args:
        config: configuration object; ``config.markup`` is read for
            ``canonicalize_hashrulers``, ``bullet_char`` and ``enum_char``.
        line_width: number of columns available to the formatted text.
        item: the ``CommentItem`` to format.

    Returns:
        A list of strings, one per output line.

    Raises:
        AssertionError: if ``line_width`` is not greater than 2 for a list
            item, or if the item kind is not handled.
    """
    if item.kind == CommentType.SEPARATOR:
        return ['']

    if item.kind == CommentType.FENCE:
        # Fences are always written in one canonical form.
        return ['~~~']

    if item.kind == CommentType.VERBATIM:
        return [line.rstrip() for line in item.lines]

    if is_hashruler(item) and config.markup.canonicalize_hashrulers:
        return ['#' * line_width]

    if item.kind in (CommentType.PARAGRAPH, CommentType.NOTE,
                     CommentType.RULER):
        wrapper = textwrap.TextWrapper(width=line_width, **COMMON_KWARGS)
        return common.stable_wrap(wrapper, '\n'.join(item.lines).strip())

    if item.kind == CommentType.BULLET_LIST:
        assert line_width > 2
        prefix = config.markup.bullet_char + ' '
        # Continuation lines line up under the text that follows the bullet
        # prefix, so the indent is derived from the prefix itself.
        indent = ' ' * len(prefix)
        wrapper = textwrap.TextWrapper(
            width=max(1, line_width - len(prefix)), **COMMON_KWARGS)
        outlines = []
        for line in item.lines:
            increment_lines = common.stable_wrap(wrapper, line.strip())
            outlines.append(prefix + increment_lines[0])
            outlines.extend(indent + iline for iline in increment_lines[1:])
        return outlines

    if item.kind == CommentType.ENUM_LIST:
        assert line_width > 2
        # Width of the largest item number. ``ceil(log10(n))`` is off by
        # one whenever n is a power of ten (including n == 1) and is
        # undefined for n == 0; counting decimal digits has neither problem.
        digits = len(str(len(item.lines)))
        fmt = '{:%dd}%s ' % (digits, config.markup.enum_char)
        # Every item prefix is the right-aligned number, the enumeration
        # punctuation and one space.
        prefix_width = digits + len(config.markup.enum_char) + 1
        indent = ' ' * prefix_width
        # Wrap to what is left after the prefix so that the prefixed lines
        # stay within line_width.
        wrapper = textwrap.TextWrapper(
            width=max(1, line_width - prefix_width), **COMMON_KWARGS)
        outlines = []
        for idx, line in enumerate(item.lines):
            increment_lines = common.stable_wrap(wrapper, line.strip())
            outlines.append(fmt.format(idx + 1) + increment_lines[0])
            outlines.extend(indent + iline for iline in increment_lines[1:])
        return outlines

    raise AssertionError('Unexepected case')
def format_items(config, line_width, items):
    """
    Format every item of a comment block and return the resulting lines.

    List items are nested: a list indented deeper than the list items that
    precede it is shifted right by two columns per nesting level, and is
    given correspondingly less width to wrap into.
    """
    list_kinds = (CommentType.BULLET_LIST, CommentType.ENUM_LIST)
    result = []
    # Source indentation of each currently open list level, outermost first.
    indent_stack = []

    for item in items:
        if item.kind not in list_kinds:
            result.extend(format_item(config, line_width, item))
            # Anything but an empty line ends the list nesting.
            if item.kind != CommentType.SEPARATOR:
                indent_stack = []
            continue

        # Close every level that is indented at least as far as this item.
        while indent_stack and indent_stack[-1] >= item.indent:
            indent_stack.pop()
        indent_stack.append(item.indent)

        shift = 2 * (len(indent_stack) - 1)
        padding = ' ' * shift
        for text in format_item(config, line_width - shift, item):
            result.append(padding + text)

    return result