#!/usr/bin/env python3 # # Syntax: mkdoc.py [-I ..] [.. a list of header files ..] # # Extract documentation from C++ header files to use it in Python bindings # import os import sys import platform import re import textwrap from clang import cindex from clang.cindex import CursorKind from collections import OrderedDict from glob import glob from threading import Thread, Semaphore from multiprocessing import cpu_count RECURSE_LIST = [ CursorKind.TRANSLATION_UNIT, CursorKind.NAMESPACE, CursorKind.CLASS_DECL, CursorKind.STRUCT_DECL, CursorKind.ENUM_DECL, CursorKind.CLASS_TEMPLATE ] PRINT_LIST = [ CursorKind.CLASS_DECL, CursorKind.STRUCT_DECL, CursorKind.ENUM_DECL, CursorKind.ENUM_CONSTANT_DECL, CursorKind.CLASS_TEMPLATE, CursorKind.FUNCTION_DECL, CursorKind.FUNCTION_TEMPLATE, CursorKind.CONVERSION_FUNCTION, CursorKind.CXX_METHOD, CursorKind.CONSTRUCTOR, CursorKind.FIELD_DECL ] PREFIX_BLACKLIST = [ CursorKind.TRANSLATION_UNIT ] CPP_OPERATORS = { '<=': 'le', '>=': 'ge', '==': 'eq', '!=': 'ne', '[]': 'array', '+=': 'iadd', '-=': 'isub', '*=': 'imul', '/=': 'idiv', '%=': 'imod', '&=': 'iand', '|=': 'ior', '^=': 'ixor', '<<=': 'ilshift', '>>=': 'irshift', '++': 'inc', '--': 'dec', '<<': 'lshift', '>>': 'rshift', '&&': 'land', '||': 'lor', '!': 'lnot', '~': 'bnot', '&': 'band', '|': 'bor', '+': 'add', '-': 'sub', '*': 'mul', '/': 'div', '%': 'mod', '<': 'lt', '>': 'gt', '=': 'assign', '()': 'call' } CPP_OPERATORS = OrderedDict( sorted(CPP_OPERATORS.items(), key=lambda t: -len(t[0]))) job_count = cpu_count() job_semaphore = Semaphore(job_count) class NoFilenamesError(ValueError): pass def d(s): return s if isinstance(s, str) else s.decode('utf8') def sanitize_name(name): name = re.sub(r'type-parameter-0-([0-9]+)', r'T\1', name) for k, v in CPP_OPERATORS.items(): name = name.replace('operator%s' % k, 'operator_%s' % v) name = re.sub('<.*>', '', name) name = ''.join([ch if ch.isalnum() else '_' for ch in name]) name = re.sub('_$', '', re.sub('_+', '_', name)) return '__doc_' + name def process_comment(comment): result = '' # Remove C++ comment syntax leading_spaces = float('inf') for s in comment.expandtabs(tabsize=4).splitlines(): s = s.strip() if s.startswith('/*'): s = s[2:].lstrip('*') elif s.endswith('*/'): s = s[:-2].rstrip('*') elif s.startswith('///'): s = s[3:] if s.startswith('*'): s = s[1:] if len(s) > 0: leading_spaces = min(leading_spaces, len(s) - len(s.lstrip())) result += s + '\n' if leading_spaces != float('inf'): result2 = "" for s in result.splitlines(): result2 += s[leading_spaces:] + '\n' result = result2 # Doxygen tags cpp_group = r'([\w:]+)' param_group = r'([\[\w:\]]+)' s = result s = re.sub(r'\\c\s+%s' % cpp_group, r'``\1``', s) s = re.sub(r'\\a\s+%s' % cpp_group, r'*\1*', s) s = re.sub(r'\\e\s+%s' % cpp_group, r'*\1*', s) s = re.sub(r'\\em\s+%s' % cpp_group, r'*\1*', s) s = re.sub(r'\\b\s+%s' % cpp_group, r'**\1**', s) s = re.sub(r'\\ingroup\s+%s' % cpp_group, r'', s) s = re.sub(r'\\param%s?\s+%s' % (param_group, cpp_group), r'\n\n$Parameter ``\2``:\n\n', s) s = re.sub(r'\\tparam%s?\s+%s' % (param_group, cpp_group), r'\n\n$Template parameter ``\2``:\n\n', s) for in_, out_ in { 'return': 'Returns', 'author': 'Author', 'authors': 'Authors', 'copyright': 'Copyright', 'date': 'Date', 'remark': 'Remark', 'sa': 'See also', 'see': 'See also', 'extends': 'Extends', 'throw': 'Throws', 'throws': 'Throws' }.items(): s = re.sub(r'\\%s\s*' % in_, r'\n\n$%s:\n\n' % out_, s) s = re.sub(r'\\details\s*', r'\n\n', s) s = re.sub(r'\\brief\s*', r'', s) s = re.sub(r'\\short\s*', r'', s) s = re.sub(r'\\ref\s*', r'', s) s = re.sub(r'\\code\s?(.*?)\s?\\endcode', r"```\n\1\n```\n", s, flags=re.DOTALL) # HTML/TeX tags s = re.sub(r'(.*?)', r'``\1``', s, flags=re.DOTALL) s = re.sub(r'
(.*?)
', r"```\n\1\n```\n", s, flags=re.DOTALL) s = re.sub(r'(.*?)', r'*\1*', s, flags=re.DOTALL) s = re.sub(r'(.*?)', r'**\1**', s, flags=re.DOTALL) s = re.sub(r'\\f\$(.*?)\\f\$', r'$\1$', s, flags=re.DOTALL) s = re.sub(r'
  • ', r'\n\n* ', s) s = re.sub(r'', r'', s) s = re.sub(r'
  • ', r'\n\n', s) s = s.replace('``true``', '``True``') s = s.replace('``false``', '``False``') # Re-flow text wrapper = textwrap.TextWrapper() wrapper.expand_tabs = True wrapper.replace_whitespace = True wrapper.drop_whitespace = True wrapper.width = 70 wrapper.initial_indent = wrapper.subsequent_indent = '' result = '' in_code_segment = False for x in re.split(r'(```)', s): if x == '```': if not in_code_segment: result += '```\n' else: result += '\n```\n\n' in_code_segment = not in_code_segment elif in_code_segment: result += x.strip() else: for y in re.split(r'(?: *\n *){2,}', x): wrapped = wrapper.fill(re.sub(r'\s+', ' ', y).strip()) if len(wrapped) > 0 and wrapped[0] == '$': result += wrapped[1:] + '\n' wrapper.initial_indent = \ wrapper.subsequent_indent = ' ' * 4 else: if len(wrapped) > 0: result += wrapped + '\n\n' wrapper.initial_indent = wrapper.subsequent_indent = '' return result.rstrip().lstrip('\n') def extract(filename, node, prefix, output): if not (node.location.file is None or os.path.samefile(d(node.location.file.name), filename)): return 0 if node.kind in RECURSE_LIST: sub_prefix = prefix if node.kind not in PREFIX_BLACKLIST: if len(sub_prefix) > 0: sub_prefix += '_' sub_prefix += d(node.spelling) for i in node.get_children(): extract(filename, i, sub_prefix, output) if node.kind in PRINT_LIST: comment = d(node.raw_comment) if node.raw_comment is not None else '' comment = process_comment(comment) sub_prefix = prefix if len(sub_prefix) > 0: sub_prefix += '_' if len(node.spelling) > 0: name = sanitize_name(sub_prefix + d(node.spelling)) output.append((name, filename, comment)) class ExtractionThread(Thread): def __init__(self, filename, parameters, output): Thread.__init__(self) self.filename = filename self.parameters = parameters self.output = output job_semaphore.acquire() def run(self): print('Processing "%s" ..' % self.filename, file=sys.stderr) try: index = cindex.Index( cindex.conf.lib.clang_createIndex(False, True)) tu = index.parse(self.filename, self.parameters) extract(self.filename, tu.cursor, '', self.output) finally: job_semaphore.release() def read_args(args): parameters = [] filenames = [] if "-x" not in args: parameters.extend(['-x', 'c++']) if not any(it.startswith("-std=") for it in args): parameters.append('-std=c++11') if platform.system() == 'Darwin': dev_path = '/Applications/Xcode.app/Contents/Developer/' lib_dir = dev_path + 'Toolchains/XcodeDefault.xctoolchain/usr/lib/' sdk_dir = dev_path + 'Platforms/MacOSX.platform/Developer/SDKs' libclang = lib_dir + 'libclang.dylib' if os.path.exists(libclang): cindex.Config.set_library_path(os.path.dirname(libclang)) if os.path.exists(sdk_dir): sysroot_dir = os.path.join(sdk_dir, next(os.walk(sdk_dir))[1][0]) parameters.append('-isysroot') parameters.append(sysroot_dir) elif platform.system() == 'Linux': # cython.util.find_library does not find `libclang` for all clang # versions and distributions. LLVM switched to a monolithical setup # that includes everything under /usr/lib/llvm{version_number}/ # We therefore glob for the library and select the highest version library_file = sorted(glob("/usr/lib/llvm-*/lib/libclang.so"), reverse=True)[0] cindex.Config.set_library_file(library_file) # clang doesn't find its own base includes by default on Linux, # but different distros install them in different paths. # Try to autodetect, preferring the highest numbered version. def clang_folder_version(d): return [int(ver) for ver in re.findall(r'(?