diff --git a/docutils/HISTORY.rst b/docutils/HISTORY.rst index e02f6e50f..420d8c261 100644 --- a/docutils/HISTORY.rst +++ b/docutils/HISTORY.rst @@ -17,7 +17,17 @@ Release 0.22.3b1.dev (unpublished) ================================== -. +* docutils/parsers/rst/states.py + + - Ignore combining characters when extracting a grid table block. + +* docutils/parsers/rst/tableparser.py + + - Ignore combining characters when parsing the grid table structure. + +* docutils/statemachine.py + + - Fix handling of combining characters when extracting a 2D block. Release 0.22.2 (2025-09-20) diff --git a/docutils/RELEASE-NOTES.rst b/docutils/RELEASE-NOTES.rst index 953b8a6ce..e06d923c7 100644 --- a/docutils/RELEASE-NOTES.rst +++ b/docutils/RELEASE-NOTES.rst @@ -266,7 +266,9 @@ Misc Release 0.22.3b1.dev (unpublished) ================================== -. +Rst parser: + Allow for combining characters in grid tables. + Fixes bugs #128 and #512. Release 0.22.2 (2025-09-20) @@ -278,19 +280,11 @@ Remove a spurious vim .swp-file. Release 0.22.1 (2025-09-17) =========================== -* docutils/parsers/rst/states.py - +Rst parser: - Relax "section title" system messages from SEVERE to ERROR. - - Fix behaviour with nested parsing into a detached node - (cf. bugs #508 and #509). - - New attribute `NestedStateMachine.parent_state_machine`. - Use case: update the "current node" of parent state machine(s) - after nested parsing. - - Better error messages for grid table markup errors (bug #504), - based on patch #214 by Jynn Nelson. - -* docutils/writers/latex2e/__init__.py + - New attribute `parsers.rst.states.NestedStateMachine.parent_state_machine`. +LaTeX writer: - Add cross-reference anchors (``\phantomsection\label{...}``) for elements with IDs (fixes bug #503). 
- Fix cross-reference anchor placement in figures, images, diff --git a/docutils/docutils/parsers/rst/states.py b/docutils/docutils/parsers/rst/states.py index 770a49c51..c28146f04 100644 --- a/docutils/docutils/parsers/rst/states.py +++ b/docutils/docutils/parsers/rst/states.py @@ -116,7 +116,7 @@ from docutils.nodes import fully_normalize_name as normalize_name from docutils.nodes import unescape, whitespace_normalize_name import docutils.parsers.rst from docutils.parsers.rst import directives, languages, tableparser, roles -from docutils.utils import escape2null, column_width +from docutils.utils import escape2null, column_width, strip_combining_chars from docutils.utils import punctuation_chars, urischemes from docutils.utils import split_escaped_whitespace from docutils.utils._roman_numerals import (InvalidRomanNumeralError, @@ -1848,7 +1848,8 @@ class Body(RSTState): messages.extend(self.malformed_table(block, detail, i)) return [], messages, blank_finish for i in range(len(block)): # check right edge - if len(block[i]) != width or block[i][-1] not in '+|': + if len(strip_combining_chars(block[i]) + ) != width or block[i][-1] not in '+|': detail = 'Right border not aligned or missing.' messages.extend(self.malformed_table(block, detail, i)) return [], messages, blank_finish diff --git a/docutils/docutils/parsers/rst/tableparser.py b/docutils/docutils/parsers/rst/tableparser.py index 68d32a56a..ffdb029f1 100644 --- a/docutils/docutils/parsers/rst/tableparser.py +++ b/docutils/docutils/parsers/rst/tableparser.py @@ -167,6 +167,9 @@ class GridTableParser(TableParser): We'll end up knowing all the row and column boundaries, cell positions and their dimensions. 
""" + # a copy of the block without combining characters: + self.stripped_block = [strip_combining_chars(line) + for line in self.block] corners = [(0, 0)] while corners: top, left = corners.pop(0) @@ -209,7 +212,7 @@ class GridTableParser(TableParser): def scan_cell(self, top, left): """Starting at the top-left corner, start tracing out a cell.""" - assert self.block[top][left] == '+' + assert self.stripped_block[top][left] == '+' return self.scan_right(top, left) def scan_right(self, top, left): @@ -218,7 +221,7 @@ class GridTableParser(TableParser): boundaries ('+'). """ colseps = {} - line = self.block[top] + line = self.stripped_block[top] for i in range(left + 1, self.right + 1): if line[i] == '+': colseps[i] = [top] @@ -238,14 +241,14 @@ class GridTableParser(TableParser): """ rowseps = {} for i in range(top + 1, self.bottom + 1): - if self.block[i][right] == '+': + if self.stripped_block[i][right] == '+': rowseps[i] = [right] result = self.scan_left(top, left, i, right) if result: newrowseps, colseps = result update_dict_of_lists(rowseps, newrowseps) return i, rowseps, colseps - elif self.block[i][right] != '|': + elif self.stripped_block[i][right] != '|': return None return None @@ -255,7 +258,7 @@ class GridTableParser(TableParser): It must line up with the starting point. 
""" colseps = {} - line = self.block[bottom] + line = self.stripped_block[bottom] for i in range(right - 1, left, -1): if line[i] == '+': colseps[i] = [bottom] @@ -275,9 +278,9 @@ class GridTableParser(TableParser): """ rowseps = {} for i in range(bottom - 1, top, -1): - if self.block[i][left] == '+': + if self.stripped_block[i][left] == '+': rowseps[i] = [left] - elif self.block[i][left] != '|': + elif self.stripped_block[i][left] != '|': return None return rowseps diff --git a/docutils/docutils/statemachine.py b/docutils/docutils/statemachine.py index aab81875d..758182186 100644 --- a/docutils/docutils/statemachine.py +++ b/docutils/docutils/statemachine.py @@ -1426,18 +1426,18 @@ class StringList(ViewList): def get_2D_block(self, top, left, bottom, right, strip_indent=True): block = self[top:bottom] indent = right - for i in range(len(block.data)): - # get slice from line, care for combining characters - ci = utils.column_indices(block.data[i]) + for i, line in enumerate(block.data): + # trim line to block borders, allow for for combining characters + adjusted_indices = utils.column_indices(line) try: - left = ci[left] + left_i = adjusted_indices[left] except IndexError: - left += len(block.data[i]) - len(ci) + left_i = left try: - right = ci[right] + right_i = adjusted_indices[right] except IndexError: - right += len(block.data[i]) - len(ci) - block.data[i] = line = block.data[i][left:right].rstrip() + right_i = len(line) + block.data[i] = line = line[left_i:right_i].rstrip() if line: indent = min(indent, len(line) - len(line.lstrip())) if strip_indent and 0 < indent < right: diff --git a/docutils/test/test_parsers/test_rst/test_TableParser.py b/docutils/test/test_parsers/test_rst/test_TableParser.py index 2ffdfc25f..54ee035af 100755 --- a/docutils/test/test_parsers/test_rst/test_TableParser.py +++ b/docutils/test/test_parsers/test_rst/test_TableParser.py @@ -73,32 +73,32 @@ totest['grid_tables'] = [ [], [[(0, 0, 1, ['A table with']), (0, 0, 1, ['two 
columns.'])]])], -# Combining chars in grid tables still fail -# ["""\ -# +--------------+------------------+ -# | A tāble w̅ith | comb̲ining chars. | -# +--------------+------------------+ -# """, -# [(0, 0, 2, 15, ['A table with']), -# (0, 15, 2, 30, ['combining chars.'])], -# ([14, 14], -# [], -# [[(0, 0, 1, ['A table with']), -# (0, 0, 1, ['combining chars.'])]])], +# Combining chars in table cells +["""\ ++--------------+------------------+ +| A tāble w̅ith | comb̲ining chars. | ++--------------+------------------+ +""", +[(0, 0, 2, 15, ['A tāble w̅ith']), + (0, 15, 2, 34, ['comb̲ining chars.'])], +([14, 18], + [], + [[(0, 0, 1, ['A tāble w̅ith']), + (0, 0, 1, ['comb̲ining chars.'])]])], ["""\ +--------------+-------------+ -| A table with | two columns | +| A tāble w̅ith | two columns | +--------------+-------------+ | and | two rows. | +--------------+-------------+ """, -[(0, 0, 2, 15, ['A table with']), +[(0, 0, 2, 15, ['A tāble w̅ith']), (0, 15, 2, 29, ['two columns']), (2, 0, 4, 15, ['and']), (2, 15, 4, 29, ['two rows.'])], ([14, 13], [], - [[(0, 0, 1, ['A table with']), + [[(0, 0, 1, ['A tāble w̅ith']), (0, 0, 1, ['two columns'])], [(0, 0, 3, ['and']), (0, 0, 3, ['two rows.'])]])], @@ -126,18 +126,18 @@ totest['grid_tables'] = [ None]])], ["""\ +------------+-------------+---------------+ -| A table | two rows in | and row spans | -| with three +-------------+ to left and | +| A tāble | two rows in | and row spans | +| with t̲h̲r̲e̲e̲ +-------------+ to left and | | columns, | the middle, | right. 
| +------------+-------------+---------------+ """, -[(0, 0, 4, 13, ['A table', 'with three', 'columns,']), +[(0, 0, 4, 13, ['A tāble', 'with t̲h̲r̲e̲e̲', 'columns,']), (0, 13, 2, 27, ['two rows in']), (0, 27, 4, 43, ['and row spans', 'to left and', 'right.']), (2, 13, 4, 27, ['the middle,'])], ([12, 13, 15], [], - [[(1, 0, 1, ['A table', 'with three', 'columns,']), + [[(1, 0, 1, ['A tāble', 'with t̲h̲r̲e̲e̲', 'columns,']), (0, 0, 1, ['two rows in']), (1, 0, 1, ['and row spans', 'to left and', 'right.'])], [None,