rST parser: allow for combining characters in grid tables.

Ignore combining characters when extracting a grid table block and
when parsing the grid table structure.
Allow for combining characters when extracting a 2D block with cell content.

Missing part of the fixes in [r7231].

Fixes [bugs:#128] and [bugs:#512].

git-svn-id: http://svn.code.sf.net/p/docutils/code/trunk@10251 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
This commit is contained in:
milde
2025-09-22 21:00:13 +00:00
parent 3acf13898c
commit 3435086033
6 changed files with 57 additions and 49 deletions

View File

@@ -17,7 +17,17 @@
Release 0.22.3b1.dev (unpublished)
==================================
.
* docutils/parsers/rst/states.py
- Ignore combining characters when extracting a grid table block.
* docutils/parsers/rst/tableparser.py
- Ignore combining characters when parsing the grid table structure.
* docutils/statemachine.py
- Fix handling of combining characters when extracting a 2D block.
Release 0.22.2 (2025-09-20)

View File

@@ -266,7 +266,9 @@ Misc
Release 0.22.3b1.dev (unpublished)
==================================
.
Rst parser:
Allow for combining characters in grid tables.
Fixes bugs #128 and #512.
Release 0.22.2 (2025-09-20)
@@ -278,19 +280,11 @@ Remove a spurious vim .swp-file.
Release 0.22.1 (2025-09-17)
===========================
* docutils/parsers/rst/states.py
Rst parser:
- Relax "section title" system messages from SEVERE to ERROR.
- Fix behaviour with nested parsing into a detached node
(cf. bugs #508 and #509).
- New attribute `NestedStateMachine.parent_state_machine`.
Use case: update the "current node" of parent state machine(s)
after nested parsing.
- Better error messages for grid table markup errors (bug #504),
based on patch #214 by Jynn Nelson.
* docutils/writers/latex2e/__init__.py
- New attribute `parsers.rst.states.NestedStateMachine.parent_state_machine`.
LaTeX writer:
- Add cross-reference anchors (``\phantomsection\label{...}``)
for elements with IDs (fixes bug #503).
- Fix cross-reference anchor placement in figures, images,

View File

@@ -116,7 +116,7 @@ from docutils.nodes import fully_normalize_name as normalize_name
from docutils.nodes import unescape, whitespace_normalize_name
import docutils.parsers.rst
from docutils.parsers.rst import directives, languages, tableparser, roles
from docutils.utils import escape2null, column_width
from docutils.utils import escape2null, column_width, strip_combining_chars
from docutils.utils import punctuation_chars, urischemes
from docutils.utils import split_escaped_whitespace
from docutils.utils._roman_numerals import (InvalidRomanNumeralError,
@@ -1848,7 +1848,8 @@ class Body(RSTState):
messages.extend(self.malformed_table(block, detail, i))
return [], messages, blank_finish
for i in range(len(block)): # check right edge
if len(block[i]) != width or block[i][-1] not in '+|':
if len(strip_combining_chars(block[i])
) != width or block[i][-1] not in '+|':
detail = 'Right border not aligned or missing.'
messages.extend(self.malformed_table(block, detail, i))
return [], messages, blank_finish

View File

@@ -167,6 +167,9 @@ class GridTableParser(TableParser):
We'll end up knowing all the row and column boundaries, cell positions
and their dimensions.
"""
# a copy of the block without combining characters:
self.stripped_block = [strip_combining_chars(line)
for line in self.block]
corners = [(0, 0)]
while corners:
top, left = corners.pop(0)
@@ -209,7 +212,7 @@ class GridTableParser(TableParser):
def scan_cell(self, top, left):
"""Starting at the top-left corner, start tracing out a cell."""
assert self.block[top][left] == '+'
assert self.stripped_block[top][left] == '+'
return self.scan_right(top, left)
def scan_right(self, top, left):
@@ -218,7 +221,7 @@ class GridTableParser(TableParser):
boundaries ('+').
"""
colseps = {}
line = self.block[top]
line = self.stripped_block[top]
for i in range(left + 1, self.right + 1):
if line[i] == '+':
colseps[i] = [top]
@@ -238,14 +241,14 @@ class GridTableParser(TableParser):
"""
rowseps = {}
for i in range(top + 1, self.bottom + 1):
if self.block[i][right] == '+':
if self.stripped_block[i][right] == '+':
rowseps[i] = [right]
result = self.scan_left(top, left, i, right)
if result:
newrowseps, colseps = result
update_dict_of_lists(rowseps, newrowseps)
return i, rowseps, colseps
elif self.block[i][right] != '|':
elif self.stripped_block[i][right] != '|':
return None
return None
@@ -255,7 +258,7 @@ class GridTableParser(TableParser):
It must line up with the starting point.
"""
colseps = {}
line = self.block[bottom]
line = self.stripped_block[bottom]
for i in range(right - 1, left, -1):
if line[i] == '+':
colseps[i] = [bottom]
@@ -275,9 +278,9 @@ class GridTableParser(TableParser):
"""
rowseps = {}
for i in range(bottom - 1, top, -1):
if self.block[i][left] == '+':
if self.stripped_block[i][left] == '+':
rowseps[i] = [left]
elif self.block[i][left] != '|':
elif self.stripped_block[i][left] != '|':
return None
return rowseps

View File

@@ -1426,18 +1426,18 @@ class StringList(ViewList):
def get_2D_block(self, top, left, bottom, right, strip_indent=True):
block = self[top:bottom]
indent = right
for i in range(len(block.data)):
# get slice from line, care for combining characters
ci = utils.column_indices(block.data[i])
for i, line in enumerate(block.data):
# trim line to block borders, allow for combining characters
adjusted_indices = utils.column_indices(line)
try:
left = ci[left]
left_i = adjusted_indices[left]
except IndexError:
left += len(block.data[i]) - len(ci)
left_i = left
try:
right = ci[right]
right_i = adjusted_indices[right]
except IndexError:
right += len(block.data[i]) - len(ci)
block.data[i] = line = block.data[i][left:right].rstrip()
right_i = len(line)
block.data[i] = line = line[left_i:right_i].rstrip()
if line:
indent = min(indent, len(line) - len(line.lstrip()))
if strip_indent and 0 < indent < right:

View File

@@ -73,32 +73,32 @@ totest['grid_tables'] = [
[],
[[(0, 0, 1, ['A table with']),
(0, 0, 1, ['two columns.'])]])],
# Combining chars in grid tables still fail
# ["""\
# +--------------+------------------+
# | A tāble w̅ith | comb̲ining chars. |
# +--------------+------------------+
# """,
# [(0, 0, 2, 15, ['A table with']),
# (0, 15, 2, 30, ['combining chars.'])],
# ([14, 14],
# [],
# [[(0, 0, 1, ['A table with']),
# (0, 0, 1, ['combining chars.'])]])],
# Combining chars in table cells
["""\
+--------------+------------------+
| A tāble w̅ith | comb̲ining chars. |
+--------------+------------------+
""",
[(0, 0, 2, 15, ['A tāble w̅ith']),
(0, 15, 2, 34, ['comb̲ining chars.'])],
([14, 18],
[],
[[(0, 0, 1, ['A tāble w̅ith']),
(0, 0, 1, ['comb̲ining chars.'])]])],
["""\
+--------------+-------------+
| A table with | two columns |
| A tāble w̅ith | two columns |
+--------------+-------------+
| and | two rows. |
+--------------+-------------+
""",
[(0, 0, 2, 15, ['A table with']),
[(0, 0, 2, 15, ['A tāble w̅ith']),
(0, 15, 2, 29, ['two columns']),
(2, 0, 4, 15, ['and']),
(2, 15, 4, 29, ['two rows.'])],
([14, 13],
[],
[[(0, 0, 1, ['A table with']),
[[(0, 0, 1, ['A tāble w̅ith']),
(0, 0, 1, ['two columns'])],
[(0, 0, 3, ['and']),
(0, 0, 3, ['two rows.'])]])],
@@ -126,18 +126,18 @@ totest['grid_tables'] = [
None]])],
["""\
+------------+-------------+---------------+
| A table | two rows in | and row spans |
| with three +-------------+ to left and |
| A tāble | two rows in | and row spans |
| with t̲h̲r̲e̲e̲ +-------------+ to left and |
| columns, | the middle, | right. |
+------------+-------------+---------------+
""",
[(0, 0, 4, 13, ['A table', 'with three', 'columns,']),
[(0, 0, 4, 13, ['A tāble', 'with t̲h̲r̲e̲e̲', 'columns,']),
(0, 13, 2, 27, ['two rows in']),
(0, 27, 4, 43, ['and row spans', 'to left and', 'right.']),
(2, 13, 4, 27, ['the middle,'])],
([12, 13, 15],
[],
[[(1, 0, 1, ['A table', 'with three', 'columns,']),
[[(1, 0, 1, ['A tāble', 'with t̲h̲r̲e̲e̲', 'columns,']),
(0, 0, 1, ['two rows in']),
(1, 0, 1, ['and row spans', 'to left and', 'right.'])],
[None,