Module fpdf.bidi
Expand source code Browse git
# This is an implementation of the Unicode Standard Annex #9
# Unicode bidirectional algorithm - Revision 48 for Unicode 15.1.0
# https://unicode.org/reports/tr9/
import unicodedata
from collections import deque
from dataclasses import dataclass, replace
from operator import itemgetter
from typing import List, Tuple
from .enums import TextDirection
# Highest explicit embedding level that rules X1-X8 may assign
# (UAX #9, BD2 "max_depth").
MAX_DEPTH = 125

# BidiBrackets 15.1.0 2023-01-18
# Loaded from https://www.unicode.org/Public/UNIDATA/BidiBrackets.txt
# This table can be dropped when the information is added on "unicodedata"
#
# Every pair is (opening bracket, closing bracket). The characters are written
# as escapes on purpose: several of them (U+2329/U+232A and the fullwidth and
# halfwidth forms) are changed by Unicode normalisation or width folding, and
# writing them literally makes them collide with other keys of the table.
_BIDI_BRACKET_PAIRS = (
    ("\u0028", "\u0029"),
    ("\u005b", "\u005d"),
    ("\u007b", "\u007d"),
    ("\u0f3a", "\u0f3b"),
    ("\u0f3c", "\u0f3d"),
    ("\u169b", "\u169c"),
    ("\u2045", "\u2046"),
    ("\u207d", "\u207e"),
    ("\u208d", "\u208e"),
    ("\u2308", "\u2309"),
    ("\u230a", "\u230b"),
    ("\u2329", "\u232a"),
    ("\u2768", "\u2769"),
    ("\u276a", "\u276b"),
    ("\u276c", "\u276d"),
    ("\u276e", "\u276f"),
    ("\u2770", "\u2771"),
    ("\u2772", "\u2773"),
    ("\u2774", "\u2775"),
    ("\u27c5", "\u27c6"),
    ("\u27e6", "\u27e7"),
    ("\u27e8", "\u27e9"),
    ("\u27ea", "\u27eb"),
    ("\u27ec", "\u27ed"),
    ("\u27ee", "\u27ef"),
    ("\u2983", "\u2984"),
    ("\u2985", "\u2986"),
    ("\u2987", "\u2988"),
    ("\u2989", "\u298a"),
    ("\u298b", "\u298c"),
    ("\u298d", "\u2990"),  # tick in top corner: pairs across the next entry
    ("\u298f", "\u298e"),  # tick in bottom corner: opener has the higher code point
    ("\u2991", "\u2992"),
    ("\u2993", "\u2994"),
    ("\u2995", "\u2996"),
    ("\u2997", "\u2998"),
    ("\u29d8", "\u29d9"),
    ("\u29da", "\u29db"),
    ("\u29fc", "\u29fd"),
    ("\u2e22", "\u2e23"),
    ("\u2e24", "\u2e25"),
    ("\u2e26", "\u2e27"),
    ("\u2e28", "\u2e29"),
    ("\u2e55", "\u2e56"),
    ("\u2e57", "\u2e58"),
    ("\u2e59", "\u2e5a"),
    ("\u2e5b", "\u2e5c"),
    ("\u3008", "\u3009"),
    ("\u300a", "\u300b"),
    ("\u300c", "\u300d"),
    ("\u300e", "\u300f"),
    ("\u3010", "\u3011"),
    ("\u3014", "\u3015"),
    ("\u3016", "\u3017"),
    ("\u3018", "\u3019"),
    ("\u301a", "\u301b"),
    ("\ufe59", "\ufe5a"),
    ("\ufe5b", "\ufe5c"),
    ("\ufe5d", "\ufe5e"),
    ("\uff08", "\uff09"),
    ("\uff3b", "\uff3d"),
    ("\uff5b", "\uff5d"),
    ("\uff5f", "\uff60"),
    ("\uff62", "\uff63"),
)

# Maps each bracket character to {"pair": <matching bracket>, "type": "o" | "c"}
# where "o" marks an opening bracket and "c" a closing one.
BIDI_BRACKETS = {
    char: {"pair": pair, "type": bracket_type}
    for opening, closing in _BIDI_BRACKET_PAIRS
    for char, pair, bracket_type in (
        (opening, closing, "o"),
        (closing, opening, "c"),
    )
}
class BidiCharacter:
    """
    A single character of a paragraph together with its bidi state.

    character_index: position of the character in the source text
    character: the character itself
    bidi_class: current bidirectional class, rewritten while rules are applied
    original_bidi_class: class assigned at construction, never rewritten here
    embedding_level: integer embedding level (odd means right-to-left)
    direction: initialised to None; not modified by this class
    """

    __slots__ = [
        "character_index",
        "character",
        "bidi_class",
        "original_bidi_class",
        "embedding_level",
        "direction",
    ]

    def __init__(
        self, character_index: int, character: str, embedding_level: int, debug: bool
    ):
        """
        Look up the bidi class of `character` with `unicodedata.bidirectional`.

        When `debug` is true, uppercase characters are given class "R" instead,
        so that plain ASCII text can stand in for right-to-left text.
        """
        self.character_index = character_index
        self.character = character
        self.bidi_class = (
            "R"
            if debug and character.isupper()
            else unicodedata.bidirectional(character)
        )
        self.original_bidi_class = self.bidi_class
        self.embedding_level = embedding_level
        self.direction = None

    def get_direction_from_level(self):
        """Return "R" when the embedding level is odd and "L" when it is even."""
        if self.embedding_level % 2:
            return "R"
        return "L"

    def set_class(self, cls):
        """Replace the current bidi class; `original_bidi_class` is left alone."""
        self.bidi_class = cls

    def __repr__(self):
        return (
            f"character_index: {self.character_index}"
            f" character: {self.character}"
            f" bidi_class: {self.bidi_class}"
            f" original_bidi_class: {self.original_bidi_class}"
            f" embedding_level: {self.embedding_level}"
            f" direction: {self.direction}"
        )
@dataclass
class DirectionalStatus:
    """One entry of the directional status stack built while applying rule X1 onwards."""

    __slots__ = (
        "embedding_level",
        "directional_override_status",
        "directional_isolate_status",
    )

    # Embedding level, between 0 and MAX_DEPTH.
    embedding_level: int
    # Override in force: "N" (Neutral), "L" (Left) or "R" (Right).
    directional_override_status: str
    # True when the entry was pushed by an isolate initiator.
    directional_isolate_status: bool
class IsolatingRun:
    """
    An isolating run sequence whose characters are resolved on construction.

    Creating an instance immediately applies, in this order, the weak type
    rules (W1-W7), the neutral type rules (N0-N2) and the implicit level rules
    (I1-I2) to `characters`, mutating the character objects in place.

    characters: the characters of the run, in logical order
    sos: direction assumed before the first character of the run
    eos: direction assumed after the last character of the run
    """

    __slots__ = ["characters", "previous_direction", "next_direction"]

    def __init__(self, characters: "List[BidiCharacter]", sos: str, eos: str):
        self.characters = characters
        self.previous_direction = sos
        self.next_direction = eos
        self.resolve_weak_types()
        self.resolve_neutral_types()
        self.resolve_implicit_levels()

    def resolve_weak_types(self) -> None:
        """Apply rules W1 to W7 to the characters of the run."""
        characters = self.characters
        count = len(characters)

        # W1. Examine each nonspacing mark (NSM) in the isolating run sequence, and change the type of the NSM to Other Neutral
        # if the previous character is an isolate initiator or PDI, and to the type of the previous character otherwise.
        # If the NSM is at the start of the isolating run sequence, it will get the type of sos.
        for index, bidi_char in enumerate(characters):
            if bidi_char.bidi_class != "NSM":
                continue
            if index == 0:
                bidi_char.set_class(self.previous_direction)
                continue
            previous_class = characters[index - 1].bidi_class
            if previous_class in ("LRI", "RLI", "FSI", "PDI"):
                bidi_char.set_class("ON")
            else:
                bidi_char.set_class(previous_class)

        # W2. Search backward from each instance of a European number until the first strong type (R, L, AL, or sos) is found.
        # If an AL is found, change the type of the European number to Arabic number.
        # W3. Change all ALs to R.
        last_strong_type = self.previous_direction
        for bidi_char in characters:
            if bidi_char.bidi_class in ("R", "L", "AL"):
                # Remembered before W3 rewrites AL, so W2 can still see it.
                last_strong_type = bidi_char.bidi_class
            if bidi_char.bidi_class == "AL":
                bidi_char.set_class("R")
            if bidi_char.bidi_class == "EN" and last_strong_type == "AL":
                bidi_char.set_class("AN")

        # W4. A single European separator between two European numbers changes to a European number.
        # A single common separator between two numbers of the same type changes to that type.
        # The first and last characters have no neighbour on one side and are skipped.
        for index in range(1, count - 1):
            bidi_char = characters[index]
            class_before = characters[index - 1].bidi_class
            class_after = characters[index + 1].bidi_class
            if (
                bidi_char.bidi_class == "ES"
                and class_before == "EN"
                and class_after == "EN"
            ):
                bidi_char.set_class("EN")
            if (
                bidi_char.bidi_class == "CS"
                and class_before in ("AN", "EN")
                and class_after == class_before
            ):
                bidi_char.set_class(class_before)

        # W5. A sequence of European terminators adjacent to European numbers changes to all European numbers.
        # W6. All remaining separators and terminators (after the application of W4 and W5) change to Other Neutral.
        # Each run of ET is resolved as a whole in one pass: this is linear and
        # does not recurse, so very long runs of terminators are safe.
        index = 0
        while index < count:
            bidi_char = characters[index]
            if bidi_char.bidi_class == "ET":
                run_end = index + 1
                while run_end < count and characters[run_end].bidi_class == "ET":
                    run_end += 1
                next_to_number = (
                    index > 0 and characters[index - 1].bidi_class == "EN"
                ) or (run_end < count and characters[run_end].bidi_class == "EN")
                resolved_class = "EN" if next_to_number else "ON"
                for terminator in characters[index:run_end]:
                    terminator.set_class(resolved_class)
                index = run_end
                continue
            if bidi_char.bidi_class in ("ES", "CS"):
                bidi_char.set_class("ON")
            index += 1

        # W7. Search backward from each instance of a European number until the first strong type (R, L, or sos) is found.
        # If an L is found, then change the type of the European number to L.
        last_strong_type = self.previous_direction
        for bidi_char in characters:
            if bidi_char.bidi_class in ("R", "L", "AL"):
                last_strong_type = bidi_char.bidi_class
            if bidi_char.bidi_class == "EN" and last_strong_type == "L":
                bidi_char.set_class("L")

    def pair_brackets(self) -> List[Tuple[int, int]]:
        """
        Calculate all the bracket pairs on an isolate run, to be used on rule N0
        How to calculate bracket pairs:
        - Basic definitions 14, 15 and 16: http://www.unicode.org/reports/tr9/#BD14
        - BIDI brackets for dummies: https://www.unicode.org/notes/tn39/

        Returns a list of (opening index, closing index) tuples sorted by
        opening index, or an empty list when more than 63 brackets are open
        at the same time.
        """
        # U+232A and U+3009 are canonically equivalent closing angle brackets,
        # so either one closes an opener that expects the other.
        equivalent_closing_angles = ("\u232a", "\u3009")
        open_brackets = []  # stack of (opening character, index in the run)
        bracket_pairs = []
        for index, char in enumerate(self.characters):
            # Only characters still classified as Other Neutral can be brackets.
            if char.bidi_class != "ON":
                continue
            bracket = BIDI_BRACKETS.get(char.character)
            if bracket is None:
                continue
            if bracket["type"] == "o":
                if len(open_brackets) >= 63:
                    return []
                open_brackets.append((char.character, index))
                continue
            # Closing bracket: look for the innermost opener it can close.
            for position in range(len(open_brackets) - 1, -1, -1):
                open_char, open_index = open_brackets[position]
                expected_closing = BIDI_BRACKETS[open_char]["pair"]
                if expected_closing == char.character or (
                    expected_closing in equivalent_closing_angles
                    and char.character in equivalent_closing_angles
                ):
                    bracket_pairs.append((open_index, index))
                    # Openers above the matched one are left unpaired.
                    del open_brackets[position:]
                    break
        return sorted(bracket_pairs, key=itemgetter(0))

    def resolve_neutral_types(self) -> None:
        """Apply rules N0 to N2 to the characters of the run."""
        characters = self.characters
        # For these rules European and Arabic numbers count as R.
        strong_direction = {"L": "L", "R": "R", "AN": "R", "EN": "R"}

        def previous_strong(index: int) -> str:
            # Nearest strong direction before `index`, falling back to sos.
            for position in range(index - 1, -1, -1):
                direction = strong_direction.get(characters[position].bidi_class)
                if direction:
                    return direction
            return self.previous_direction

        # N0-N2: Resolving neutral types
        # N0
        brackets = self.pair_brackets()
        if brackets:
            embedding_direction = characters[0].get_direction_from_level()
            opposite_direction = "L" if embedding_direction == "R" else "R"
            for open_index, close_index in brackets:
                strong_same_direction = False
                strong_opposite_direction = False
                for index in range(open_index, close_index):
                    direction = strong_direction.get(characters[index].bidi_class)
                    if direction is None:
                        continue
                    if direction == embedding_direction:
                        strong_same_direction = True
                        break
                    strong_opposite_direction = True

                resulting_direction = None
                if strong_same_direction:
                    resulting_direction = embedding_direction
                elif strong_opposite_direction:
                    # Only opposite-direction content inside the pair: follow
                    # the context established before the opening bracket.
                    if previous_strong(open_index) == opposite_direction:
                        resulting_direction = opposite_direction
                    else:
                        resulting_direction = embedding_direction

                if resulting_direction:
                    characters[open_index].bidi_class = resulting_direction
                    characters[close_index].bidi_class = resulting_direction
                    if len(characters) > close_index + 1:
                        next_char = characters[close_index + 1]
                        # A mark that W1 attached to the closing bracket follows it.
                        if (
                            next_char.original_bidi_class == "NSM"
                            and next_char.bidi_class == "ON"
                        ):
                            next_char.bidi_class = resulting_direction

        # N1-N2
        # following_strong[i] is the first strong direction after position i
        # (eos when there is none). Characters after i are not modified before
        # position i is handled, so it can be computed once, right to left.
        following_strong = [self.next_direction] * len(characters)
        upcoming = self.next_direction
        for index in range(len(characters) - 1, -1, -1):
            following_strong[index] = upcoming
            direction = strong_direction.get(characters[index].bidi_class)
            if direction:
                upcoming = direction

        # preceding tracks the last strong direction seen so far and includes
        # the neutrals already resolved by this loop.
        preceding = self.previous_direction
        for index, bidi_char in enumerate(characters):
            if bidi_char.bidi_class in (
                "B",
                "S",
                "WS",
                "ON",
                "FSI",
                "LRI",
                "RLI",
                "PDI",
            ):
                if preceding == following_strong[index]:
                    bidi_char.bidi_class = preceding
                else:
                    bidi_char.bidi_class = bidi_char.get_direction_from_level()
            direction = strong_direction.get(bidi_char.bidi_class)
            if direction:
                preceding = direction

    def resolve_implicit_levels(self) -> None:
        """Apply rules I1 and I2, raising embedding levels in place."""
        for bidi_char in self.characters:
            bidi_class = bidi_char.bidi_class
            if bidi_char.embedding_level % 2 == 0:
                # I1. For all characters with an even (left-to-right) embedding level,
                # those of type R go up one level and those of type AN or EN go up two levels.
                if bidi_class == "R":
                    bidi_char.embedding_level += 1
                elif bidi_class in ("AN", "EN"):
                    bidi_char.embedding_level += 2
            elif bidi_class in ("L", "EN", "AN"):
                # I2. For all characters with an odd (right-to-left) embedding level, those of type L, EN or AN go up one level.
                bidi_char.embedding_level += 1
def auto_detect_base_direction(
    string: str, stop_at_pdi: bool = False, debug: bool = False
) -> "TextDirection":
    """
    This function applies rules P2 and P3 to detect the direction of a paragraph, returning
    the first strong direction and skipping over isolate sequences.
    P1 must be applied before calling this function (breaking into paragraphs).

    string: the text to inspect
    stop_at_pdi: set to True to get the direction of a single isolate sequence;
        the scan then ends at the first PDI that does not close a nested isolate
    debug: when True, uppercase characters are treated as class "R"

    Returns TextDirection.RTL when the first strong character outside nested
    isolates is R or AL, and TextDirection.LTR otherwise.
    """
    # Auto-LTR (standard BIDI) uses the first L/R/AL character, and is LTR if none is found.
    isolate_depth = 0
    for char in string:
        # The uppercase test must look at the character: bidi class names are
        # always uppercase, so testing the class would turn everything into "R".
        if debug and char.isupper():
            bidi_class = "R"
        else:
            bidi_class = unicodedata.bidirectional(char)

        if bidi_class in ("LRI", "RLI", "FSI"):
            isolate_depth += 1
        elif bidi_class == "PDI":
            if isolate_depth > 0:
                isolate_depth -= 1
            elif stop_at_pdi:
                # End of the isolate being inspected without any strong character.
                return TextDirection.LTR
        elif isolate_depth == 0:
            if bidi_class in ("R", "AL"):
                return TextDirection.RTL
            if bidi_class == "L":
                return TextDirection.LTR
    return TextDirection.LTR
def calculate_isolate_runs(
    paragraph: "List[BidiCharacter]",
) -> "List[IsolatingRun]":
    """
    Split a paragraph into isolating run sequences (BD13 and X10).

    The characters are first grouped into level runs (maximal stretches that
    share one embedding level). Level runs that end with an isolate initiator
    are then joined with the later level runs of the same level that start
    with a PDI. One IsolatingRun is created per resulting sequence, and
    creating it resolves its characters in place.

    paragraph: characters with their explicit embedding levels already set
    Returns the list of IsolatingRun objects; empty for an empty paragraph.
    """
    if not paragraph:
        return []

    isolate_initiators = ("LRI", "RLI", "FSI")

    def level_to_direction(level: int) -> str:
        return "R" if level % 2 else "L"

    # BD13 and X10
    level_runs = []
    current_run = None
    for bidi_char in paragraph:
        if current_run is None or bidi_char.embedding_level != current_run["level"]:
            current_run = {
                "level": bidi_char.embedding_level,
                "text": [],
                "complete": False,
            }
            level_runs.append(current_run)
        current_run["text"].append(bidi_char)

    # compute sos, eos for each level run
    last_index = len(level_runs) - 1
    for index, run in enumerate(level_runs):
        level = run["level"]
        if index == 0:
            sos_level = level
        else:
            sos_level = max(level, level_runs[index - 1]["level"])
        if (
            index == last_index
            or run["text"][-1].original_bidi_class in isolate_initiators
        ):
            # X10 - last char is an isolator without matching PDI - set EOS to embedding level
            eos_level = level
        else:
            eos_level = max(level, level_runs[index + 1]["level"])
        run["sos"] = level_to_direction(sos_level)
        run["eos"] = level_to_direction(eos_level)

    # combine levels runs to create isolate runs
    isolate_runs = []
    for index, run in enumerate(level_runs):
        if run["complete"]:
            # Already absorbed by an earlier run.
            continue
        run["complete"] = True
        run_characters = run["text"]
        eos = run["eos"]
        if run_characters[-1].original_bidi_class in isolate_initiators:
            for later_run in level_runs[index + 1 :]:
                if (
                    later_run["level"] == run["level"]
                    and later_run["text"][0].original_bidi_class == "PDI"
                ):
                    run_characters.extend(later_run["text"])
                    later_run["complete"] = True
                    eos = later_run["eos"]
                    # Keep joining only while the absorbed run opens another isolate.
                    if (
                        later_run["text"][-1].original_bidi_class
                        not in isolate_initiators
                    ):
                        break
        isolate_runs.append(
            IsolatingRun(characters=run_characters, sos=run["sos"], eos=eos)
        )
    return isolate_runs
class BidiParagraph:
    """
    A paragraph of text processed with the Unicode bidirectional algorithm.

    text: the paragraph text
    base_direction: paragraph direction; auto-detected from the text when falsy
    debug: when True, uppercase characters are treated as right-to-left

    Explicit levels and isolating runs are resolved when the instance is
    created; reordering is computed on demand by the accessor methods.
    """

    __slots__ = (
        "text",
        "base_direction",
        "debug",
        "base_embedding_level",
        "characters",
    )

    def __init__(
        self, text: str, base_direction: "TextDirection" = None, debug: bool = False
    ):
        self.text = text
        # `debug` has to be passed by keyword: the second positional parameter
        # of auto_detect_base_direction is `stop_at_pdi`.
        self.base_direction = (
            base_direction
            if base_direction
            else auto_detect_base_direction(self.text, debug=debug)
        )
        self.debug = debug
        self.base_embedding_level = (
            0 if self.base_direction == TextDirection.LTR else 1
        )  # base level
        self.characters: List[BidiCharacter] = []
        self.get_bidi_characters()

    def get_characters(self) -> "List[BidiCharacter]":
        """Return the resolved characters in logical order."""
        return self.characters

    def get_characters_with_embedding_level(self) -> "List[BidiCharacter]":
        """Return the characters in logical order after rule L1 has adjusted their levels."""
        # Calculate embedding level for each character after breaking isolating runs.
        # Only used on conformance testing
        self.reorder_resolved_levels()
        return self.characters

    def get_reordered_characters(self) -> "Tuple[BidiCharacter, ...]":
        """Return the characters in visual order."""
        return self.reorder_resolved_levels()

    def get_all(self):
        """Return a (logical order list, visual order tuple) pair."""
        return self.characters, self.reorder_resolved_levels()

    def get_reordered_string(self):
        "Used for conformance validation"
        return "".join(c.character for c in self.reorder_resolved_levels())

    def get_bidi_fragments(self):
        """Return the text split into (fragment, TextDirection) tuples."""
        return self.split_bidi_fragments()

    def get_bidi_characters(self) -> None:
        """
        Apply the explicit level rules (X1-X9) to `self.text`, store the kept
        characters in `self.characters` and resolve their isolating runs.
        """
        # Explicit levels and directions. Rule X1
        stack: "deque[DirectionalStatus]" = deque()
        current_status = DirectionalStatus(
            embedding_level=self.base_embedding_level,
            directional_override_status="N",
            directional_isolate_status=False,
        )
        stack.append(replace(current_status))
        overflow_isolate_count = 0
        overflow_embedding_count = 0
        valid_isolate_count = 0
        results = []

        # Explicit embeddings. Process each character individually applying rules X2 through X8
        for index, char in enumerate(self.text):
            bidi_char = BidiCharacter(
                index, char, current_status.embedding_level, self.debug
            )
            new_bidi_class = None

            if bidi_char.bidi_class == "FSI":
                # An FSI behaves as LRI or RLI depending on the first strong
                # character found before its matching PDI.
                isolate_direction = auto_detect_base_direction(
                    self.text[index + 1 :], stop_at_pdi=True, debug=self.debug
                )
                bidi_char.bidi_class = (
                    "LRI" if isolate_direction == TextDirection.LTR else "RLI"
                )

            # The class is not rewritten again before the X9 step below.
            explicit_class = bidi_char.bidi_class

            if explicit_class in ("RLE", "LRE", "RLO", "LRO", "RLI", "LRI"):
                # X2 - X5: calculate explicit embeddings and explicit overrides
                is_isolate = explicit_class[2] == "I"
                if explicit_class[0] == "R":
                    new_embedding_level = (
                        current_status.embedding_level + 1
                    ) | 1  # least greater odd
                else:
                    new_embedding_level = (
                        current_status.embedding_level + 2
                    ) & ~1  # least greater even
                if is_isolate and current_status.directional_override_status != "N":
                    # The initiator itself still belongs to the enclosing override.
                    new_bidi_class = current_status.directional_override_status
                if (
                    new_embedding_level <= MAX_DEPTH
                    and overflow_isolate_count == 0
                    and overflow_embedding_count == 0
                ):
                    current_status.embedding_level = new_embedding_level
                    current_status.directional_override_status = (
                        explicit_class[0] if explicit_class[2] == "O" else "N"
                    )
                    current_status.directional_isolate_status = is_isolate
                    if is_isolate:
                        valid_isolate_count += 1
                    stack.append(replace(current_status))
                elif is_isolate:
                    overflow_isolate_count += 1
                elif overflow_isolate_count == 0:
                    overflow_embedding_count += 1

            if explicit_class not in (
                "B",
                "BN",
                "RLE",
                "LRE",
                "RLO",
                "LRO",
                "PDF",
                "FSI",
                "PDI",
            ):  # X6
                if current_status.directional_override_status != "N":
                    new_bidi_class = current_status.directional_override_status

            if explicit_class == "PDI":  # X6a
                if overflow_isolate_count > 0:
                    overflow_isolate_count -= 1
                elif valid_isolate_count > 0:
                    overflow_embedding_count = 0
                    # Drop the embeddings opened inside the isolate, then the
                    # isolate entry itself.
                    while not stack[-1].directional_isolate_status:
                        stack.pop()
                    stack.pop()
                    current_status = replace(stack[-1])
                    valid_isolate_count -= 1
                bidi_char.embedding_level = current_status.embedding_level
                if current_status.directional_override_status != "N":
                    new_bidi_class = current_status.directional_override_status

            if explicit_class == "PDF":  # X7
                if overflow_isolate_count == 0:
                    if overflow_embedding_count > 0:
                        overflow_embedding_count -= 1
                    elif (
                        not current_status.directional_isolate_status
                        and len(stack) > 1
                    ):
                        stack.pop()
                        current_status = replace(stack[-1])

            if new_bidi_class:
                bidi_char.bidi_class = new_bidi_class

            if bidi_char.bidi_class not in (
                "RLE",
                "LRE",
                "RLO",
                "LRO",
                "PDF",
                "BN",
            ):  # X9
                if bidi_char.bidi_class == "B":
                    bidi_char.embedding_level = self.base_embedding_level
                elif bidi_char.original_bidi_class not in ("LRI", "RLI", "FSI"):
                    # Isolate initiators keep the level they were created with.
                    bidi_char.embedding_level = current_status.embedding_level
                results.append(bidi_char)

        if not results:
            self.characters = []
            return
        self.characters = results
        calculate_isolate_runs(results)

    def split_bidi_fragments(self):
        """
        Group consecutive characters that share a direction.

        Returns a tuple of (text, TextDirection) pairs in logical order; the
        direction comes from the parity of each character's embedding level.
        """
        bidi_fragments = []
        fragment_characters = []
        current_direction = ""

        def flush() -> None:
            # Emit the pending characters, if any, as one fragment.
            if not fragment_characters:
                return
            bidi_fragments.append(
                (
                    "".join(fragment_characters),
                    (
                        TextDirection.RTL
                        if current_direction == "R"
                        else TextDirection.LTR
                    ),
                )
            )
            fragment_characters.clear()

        for bidi_char in self.characters:
            direction = bidi_char.get_direction_from_level()
            if direction != current_direction:
                flush()
                current_direction = direction
            fragment_characters.append(bidi_char.character)
        flush()
        return tuple(bidi_fragments)

    def reorder_resolved_levels(self):
        """
        Apply rules L1 and L2 and return the characters in visual order.

        Rule L1 rewrites embedding levels on the stored characters; the stored
        list itself keeps its logical order. Returns a tuple.
        """
        before_separator = True
        end_of_line = True
        max_level = 0
        min_odd_level = 999  # larger than any level, so "no odd level" reverses nothing
        for bidi_char in reversed(self.characters):
            # Rule L1. Reset the embedding level of segment separators, paragraph separators,
            # and any adjacent whitespace.
            original_class = bidi_char.original_bidi_class
            if original_class in ("S", "B"):
                bidi_char.embedding_level = self.base_embedding_level
                before_separator = True
            elif original_class in ("BN", "WS", "FSI", "LRI", "RLI", "PDI"):
                if before_separator or end_of_line:
                    bidi_char.embedding_level = self.base_embedding_level
            else:
                before_separator = False
                end_of_line = False

            level = bidi_char.embedding_level
            if level > max_level:
                max_level = level
            if level % 2 != 0 and level < min_odd_level:
                min_odd_level = level

        # Rule L2. From the highest level found in the text to the lowest odd level on each line,
        # reverse any contiguous sequence of characters that are at that level or higher.
        reordered_paragraph = list(self.characters)
        for level in range(max_level, min_odd_level - 1, -1):
            pass_result = []
            pending = []
            for bidi_char in reordered_paragraph:
                if bidi_char.embedding_level >= level:
                    pending.append(bidi_char)
                    continue
                pass_result.extend(reversed(pending))
                pending = []
                pass_result.append(bidi_char)
            pass_result.extend(reversed(pending))
            reordered_paragraph = pass_result
        return tuple(reordered_paragraph)
Functions
def auto_detect_base_direction(string: str, stop_at_pdi: bool = False, debug: bool = False) ‑> TextDirection
-
This function applies rules P2 and P3 to detect the direction of a paragraph, returning the first strong direction and skipping over isolate sequences. P1 (breaking the text into paragraphs) must be applied before calling this function. stop_at_pdi can be set to True to get the direction of a single isolate sequence.
Expand source code Browse git
def auto_detect_base_direction(
    string: str, stop_at_pdi: bool = False, debug: bool = False
) -> "TextDirection":
    """
    This function applies rules P2 and P3 to detect the direction of a paragraph, returning
    the first strong direction and skipping over isolate sequences.
    P1 must be applied before calling this function (breaking into paragraphs).

    string: the text to inspect
    stop_at_pdi: set to True to get the direction of a single isolate sequence;
        the scan then ends at the first PDI that does not close a nested isolate
    debug: when True, uppercase characters are treated as class "R"

    Returns TextDirection.RTL when the first strong character outside nested
    isolates is R or AL, and TextDirection.LTR otherwise.
    """
    # Auto-LTR (standard BIDI) uses the first L/R/AL character, and is LTR if none is found.
    isolate_depth = 0
    for char in string:
        # The uppercase test must look at the character: bidi class names are
        # always uppercase, so testing the class would turn everything into "R".
        if debug and char.isupper():
            bidi_class = "R"
        else:
            bidi_class = unicodedata.bidirectional(char)

        if bidi_class in ("LRI", "RLI", "FSI"):
            isolate_depth += 1
        elif bidi_class == "PDI":
            if isolate_depth > 0:
                isolate_depth -= 1
            elif stop_at_pdi:
                # End of the isolate being inspected without any strong character.
                return TextDirection.LTR
        elif isolate_depth == 0:
            if bidi_class in ("R", "AL"):
                return TextDirection.RTL
            if bidi_class == "L":
                return TextDirection.LTR
    return TextDirection.LTR
def calculate_isolate_runs(paragraph: List[BidiCharacter]) ‑> List[IsolatingRun]
-
Expand source code Browse git
def calculate_isolate_runs(
    paragraph: "List[BidiCharacter]",
) -> "List[IsolatingRun]":
    """
    Split a paragraph into isolating run sequences (BD13 and X10).

    The characters are first grouped into level runs (maximal stretches that
    share one embedding level). Level runs that end with an isolate initiator
    are then joined with the later level runs of the same level that start
    with a PDI. One IsolatingRun is created per resulting sequence, and
    creating it resolves its characters in place.

    paragraph: characters with their explicit embedding levels already set
    Returns the list of IsolatingRun objects; empty for an empty paragraph.
    """
    if not paragraph:
        return []

    isolate_initiators = ("LRI", "RLI", "FSI")

    def level_to_direction(level: int) -> str:
        return "R" if level % 2 else "L"

    # BD13 and X10
    level_runs = []
    current_run = None
    for bidi_char in paragraph:
        if current_run is None or bidi_char.embedding_level != current_run["level"]:
            current_run = {
                "level": bidi_char.embedding_level,
                "text": [],
                "complete": False,
            }
            level_runs.append(current_run)
        current_run["text"].append(bidi_char)

    # compute sos, eos for each level run
    last_index = len(level_runs) - 1
    for index, run in enumerate(level_runs):
        level = run["level"]
        if index == 0:
            sos_level = level
        else:
            sos_level = max(level, level_runs[index - 1]["level"])
        if (
            index == last_index
            or run["text"][-1].original_bidi_class in isolate_initiators
        ):
            # X10 - last char is an isolator without matching PDI - set EOS to embedding level
            eos_level = level
        else:
            eos_level = max(level, level_runs[index + 1]["level"])
        run["sos"] = level_to_direction(sos_level)
        run["eos"] = level_to_direction(eos_level)

    # combine levels runs to create isolate runs
    isolate_runs = []
    for index, run in enumerate(level_runs):
        if run["complete"]:
            # Already absorbed by an earlier run.
            continue
        run["complete"] = True
        run_characters = run["text"]
        eos = run["eos"]
        if run_characters[-1].original_bidi_class in isolate_initiators:
            for later_run in level_runs[index + 1 :]:
                if (
                    later_run["level"] == run["level"]
                    and later_run["text"][0].original_bidi_class == "PDI"
                ):
                    run_characters.extend(later_run["text"])
                    later_run["complete"] = True
                    eos = later_run["eos"]
                    # Keep joining only while the absorbed run opens another isolate.
                    if (
                        later_run["text"][-1].original_bidi_class
                        not in isolate_initiators
                    ):
                        break
        isolate_runs.append(
            IsolatingRun(characters=run_characters, sos=run["sos"], eos=eos)
        )
    return isolate_runs
Classes
class BidiCharacter (character_index: int, character: str, embedding_level: str, debug: bool)
-
Expand source code Browse git
class BidiCharacter:
    """
    A single character of a paragraph together with its bidi state.

    character_index: position of the character in the source text
    character: the character itself
    bidi_class: current bidirectional class, rewritten while rules are applied
    original_bidi_class: class assigned at construction, never rewritten here
    embedding_level: integer embedding level (odd means right-to-left)
    direction: initialised to None; not modified by this class
    """

    __slots__ = [
        "character_index",
        "character",
        "bidi_class",
        "original_bidi_class",
        "embedding_level",
        "direction",
    ]

    def __init__(
        self, character_index: int, character: str, embedding_level: int, debug: bool
    ):
        """
        Look up the bidi class of `character` with `unicodedata.bidirectional`.

        When `debug` is true, uppercase characters are given class "R" instead,
        so that plain ASCII text can stand in for right-to-left text.
        """
        self.character_index = character_index
        self.character = character
        self.bidi_class = (
            "R"
            if debug and character.isupper()
            else unicodedata.bidirectional(character)
        )
        self.original_bidi_class = self.bidi_class
        self.embedding_level = embedding_level
        self.direction = None

    def get_direction_from_level(self):
        """Return "R" when the embedding level is odd and "L" when it is even."""
        if self.embedding_level % 2:
            return "R"
        return "L"

    def set_class(self, cls):
        """Replace the current bidi class; `original_bidi_class` is left alone."""
        self.bidi_class = cls

    def __repr__(self):
        return (
            f"character_index: {self.character_index}"
            f" character: {self.character}"
            f" bidi_class: {self.bidi_class}"
            f" original_bidi_class: {self.original_bidi_class}"
            f" embedding_level: {self.embedding_level}"
            f" direction: {self.direction}"
        )
Instance variables
var bidi_class
-
Return an attribute of instance, which is of type owner.
var character
-
Return an attribute of instance, which is of type owner.
var character_index
-
Return an attribute of instance, which is of type owner.
var direction
-
Return an attribute of instance, which is of type owner.
var embedding_level
-
Return an attribute of instance, which is of type owner.
var original_bidi_class
-
Return an attribute of instance, which is of type owner.
Methods
def get_direction_from_level(self)
-
Expand source code Browse git
def get_direction_from_level(self):
    """Return "R" when the embedding level is odd and "L" when it is even."""
    if self.embedding_level % 2:
        return "R"
    return "L"
def set_class(self, cls)
-
Expand source code Browse git
def set_class(self, cls):
    """Store `cls` as the character's current bidi class."""
    new_class = cls
    self.bidi_class = new_class
class BidiParagraph (text: str, base_direction: TextDirection = None, debug: bool = False)
-
Expand source code Browse git
class BidiParagraph:
    """
    A paragraph of text processed with the Unicode bidirectional algorithm.

    text: the paragraph text
    base_direction: paragraph direction; auto-detected from the text when falsy
    debug: when True, uppercase characters are treated as right-to-left

    Explicit levels and isolating runs are resolved when the instance is
    created; reordering is computed on demand by the accessor methods.
    """

    __slots__ = (
        "text",
        "base_direction",
        "debug",
        "base_embedding_level",
        "characters",
    )

    def __init__(
        self, text: str, base_direction: "TextDirection" = None, debug: bool = False
    ):
        self.text = text
        # `debug` has to be passed by keyword: the second positional parameter
        # of auto_detect_base_direction is `stop_at_pdi`.
        self.base_direction = (
            base_direction
            if base_direction
            else auto_detect_base_direction(self.text, debug=debug)
        )
        self.debug = debug
        self.base_embedding_level = (
            0 if self.base_direction == TextDirection.LTR else 1
        )  # base level
        self.characters: List[BidiCharacter] = []
        self.get_bidi_characters()

    def get_characters(self) -> "List[BidiCharacter]":
        """Return the resolved characters in logical order."""
        return self.characters

    def get_characters_with_embedding_level(self) -> "List[BidiCharacter]":
        """Return the characters in logical order after rule L1 has adjusted their levels."""
        # Calculate embedding level for each character after breaking isolating runs.
        # Only used on conformance testing
        self.reorder_resolved_levels()
        return self.characters

    def get_reordered_characters(self) -> "Tuple[BidiCharacter, ...]":
        """Return the characters in visual order."""
        return self.reorder_resolved_levels()

    def get_all(self):
        """Return a (logical order list, visual order tuple) pair."""
        return self.characters, self.reorder_resolved_levels()

    def get_reordered_string(self):
        "Used for conformance validation"
        return "".join(c.character for c in self.reorder_resolved_levels())

    def get_bidi_fragments(self):
        """Return the text split into (fragment, TextDirection) tuples."""
        return self.split_bidi_fragments()

    def get_bidi_characters(self) -> None:
        """
        Apply the explicit level rules (X1-X9) to `self.text`, store the kept
        characters in `self.characters` and resolve their isolating runs.
        """
        # Explicit levels and directions. Rule X1
        stack: "deque[DirectionalStatus]" = deque()
        current_status = DirectionalStatus(
            embedding_level=self.base_embedding_level,
            directional_override_status="N",
            directional_isolate_status=False,
        )
        stack.append(replace(current_status))
        overflow_isolate_count = 0
        overflow_embedding_count = 0
        valid_isolate_count = 0
        results = []

        # Explicit embeddings. Process each character individually applying rules X2 through X8
        for index, char in enumerate(self.text):
            bidi_char = BidiCharacter(
                index, char, current_status.embedding_level, self.debug
            )
            new_bidi_class = None

            if bidi_char.bidi_class == "FSI":
                # An FSI behaves as LRI or RLI depending on the first strong
                # character found before its matching PDI.
                isolate_direction = auto_detect_base_direction(
                    self.text[index + 1 :], stop_at_pdi=True, debug=self.debug
                )
                bidi_char.bidi_class = (
                    "LRI" if isolate_direction == TextDirection.LTR else "RLI"
                )

            # The class is not rewritten again before the X9 step below.
            explicit_class = bidi_char.bidi_class

            if explicit_class in ("RLE", "LRE", "RLO", "LRO", "RLI", "LRI"):
                # X2 - X5: calculate explicit embeddings and explicit overrides
                is_isolate = explicit_class[2] == "I"
                if explicit_class[0] == "R":
                    new_embedding_level = (
                        current_status.embedding_level + 1
                    ) | 1  # least greater odd
                else:
                    new_embedding_level = (
                        current_status.embedding_level + 2
                    ) & ~1  # least greater even
                if is_isolate and current_status.directional_override_status != "N":
                    # The initiator itself still belongs to the enclosing override.
                    new_bidi_class = current_status.directional_override_status
                if (
                    new_embedding_level <= MAX_DEPTH
                    and overflow_isolate_count == 0
                    and overflow_embedding_count == 0
                ):
                    current_status.embedding_level = new_embedding_level
                    current_status.directional_override_status = (
                        explicit_class[0] if explicit_class[2] == "O" else "N"
                    )
                    current_status.directional_isolate_status = is_isolate
                    if is_isolate:
                        valid_isolate_count += 1
                    stack.append(replace(current_status))
                elif is_isolate:
                    overflow_isolate_count += 1
                elif overflow_isolate_count == 0:
                    overflow_embedding_count += 1

            if explicit_class not in (
                "B",
                "BN",
                "RLE",
                "LRE",
                "RLO",
                "LRO",
                "PDF",
                "FSI",
                "PDI",
            ):  # X6
                if current_status.directional_override_status != "N":
                    new_bidi_class = current_status.directional_override_status

            if explicit_class == "PDI":  # X6a
                if overflow_isolate_count > 0:
                    overflow_isolate_count -= 1
                elif valid_isolate_count > 0:
                    overflow_embedding_count = 0
                    # Drop the embeddings opened inside the isolate, then the
                    # isolate entry itself.
                    while not stack[-1].directional_isolate_status:
                        stack.pop()
                    stack.pop()
                    current_status = replace(stack[-1])
                    valid_isolate_count -= 1
                bidi_char.embedding_level = current_status.embedding_level
                if current_status.directional_override_status != "N":
                    new_bidi_class = current_status.directional_override_status

            if explicit_class == "PDF":  # X7
                if overflow_isolate_count == 0:
                    if overflow_embedding_count > 0:
                        overflow_embedding_count -= 1
                    elif (
                        not current_status.directional_isolate_status
                        and len(stack) > 1
                    ):
                        stack.pop()
                        current_status = replace(stack[-1])

            if new_bidi_class:
                bidi_char.bidi_class = new_bidi_class

            if bidi_char.bidi_class not in (
                "RLE",
                "LRE",
                "RLO",
                "LRO",
                "PDF",
                "BN",
            ):  # X9
                if bidi_char.bidi_class == "B":
                    bidi_char.embedding_level = self.base_embedding_level
                elif bidi_char.original_bidi_class not in ("LRI", "RLI", "FSI"):
                    # Isolate initiators keep the level they were created with.
                    bidi_char.embedding_level = current_status.embedding_level
                results.append(bidi_char)

        if not results:
            self.characters = []
            return
        self.characters = results
        calculate_isolate_runs(results)

    def split_bidi_fragments(self):
        """
        Group consecutive characters that share a direction.

        Returns a tuple of (text, TextDirection) pairs in logical order; the
        direction comes from the parity of each character's embedding level.
        """
        bidi_fragments = []
        fragment_characters = []
        current_direction = ""

        def flush() -> None:
            # Emit the pending characters, if any, as one fragment.
            if not fragment_characters:
                return
            bidi_fragments.append(
                (
                    "".join(fragment_characters),
                    (
                        TextDirection.RTL
                        if current_direction == "R"
                        else TextDirection.LTR
                    ),
                )
            )
            fragment_characters.clear()

        for bidi_char in self.characters:
            direction = bidi_char.get_direction_from_level()
            if direction != current_direction:
                flush()
                current_direction = direction
            fragment_characters.append(bidi_char.character)
        flush()
        return tuple(bidi_fragments)

    def reorder_resolved_levels(self):
        """
        Apply rules L1 and L2 and return the characters in visual order.

        Rule L1 rewrites embedding levels on the stored characters; the stored
        list itself keeps its logical order. Returns a tuple.
        """
        before_separator = True
        end_of_line = True
        max_level = 0
        min_odd_level = 999  # larger than any level, so "no odd level" reverses nothing
        for bidi_char in reversed(self.characters):
            # Rule L1. Reset the embedding level of segment separators, paragraph separators,
            # and any adjacent whitespace.
            original_class = bidi_char.original_bidi_class
            if original_class in ("S", "B"):
                bidi_char.embedding_level = self.base_embedding_level
                before_separator = True
            elif original_class in ("BN", "WS", "FSI", "LRI", "RLI", "PDI"):
                if before_separator or end_of_line:
                    bidi_char.embedding_level = self.base_embedding_level
            else:
                before_separator = False
                end_of_line = False

            level = bidi_char.embedding_level
            if level > max_level:
                max_level = level
            if level % 2 != 0 and level < min_odd_level:
                min_odd_level = level

        # Rule L2. From the highest level found in the text to the lowest odd level on each line,
        # reverse any contiguous sequence of characters that are at that level or higher.
        reordered_paragraph = list(self.characters)
        for level in range(max_level, min_odd_level - 1, -1):
            pass_result = []
            pending = []
            for bidi_char in reordered_paragraph:
                if bidi_char.embedding_level >= level:
                    pending.append(bidi_char)
                    continue
                pass_result.extend(reversed(pending))
                pending = []
                pass_result.append(bidi_char)
            pass_result.extend(reversed(pending))
            reordered_paragraph = pass_result
        return tuple(reordered_paragraph)
Instance variables
var base_direction
-
Return an attribute of instance, which is of type owner.
var base_embedding_level
-
Return an attribute of instance, which is of type owner.
var characters
-
Return an attribute of instance, which is of type owner.
var debug
-
Return an attribute of instance, which is of type owner.
var text
-
Return an attribute of instance, which is of type owner.
Methods
def get_all(self)
-
Expand source code Browse git
def get_all(self):
    """
    Return a 2-tuple: the characters as stored on the instance, followed by
    the result of reorder_resolved_levels().

    The stored list is read before the reordering call is made.
    """
    logical_order = self.characters
    visual_order = self.reorder_resolved_levels()
    return logical_order, visual_order
def get_bidi_characters(self) ‑> List[BidiCharacter]
-
Expand source code Browse git
def get_bidi_characters(self) -> List[BidiCharacter]:
    """
    Resolve the explicit embedding levels (rules X1 to X9) for every
    character of self.text.

    The characters that survive rule X9 are stored in self.characters and then
    passed to calculate_isolate_runs().
    NOTE: every exit path returns None (bare return / end of function), even
    though the signature is annotated as returning a list.
    """
    # Explicit levels and directions. Rule X1
    stack: List[DirectionalStatus] = deque()
    current_status = DirectionalStatus(
        embedding_level=self.base_embedding_level,
        directional_override_status="N",
        directional_isolate_status=False,
    )
    # The stack always holds copies, so mutating current_status below never
    # alters an entry that was already pushed.
    stack.append(replace(current_status))
    overflow_isolate_count = 0
    overflow_embedding_count = 0
    valid_isolate_count = 0
    results = []
    # Explicit embeddings. Process each character individually applying rules X2 through X8
    for index, char in enumerate(self.text):
        bidi_char = BidiCharacter(
            index, char, current_status.embedding_level, self.debug
        )
        # Class forced by an active override, applied once all rules ran
        new_bidi_class = None
        if bidi_char.bidi_class == "FSI":
            # An FSI behaves as LRI or RLI depending on the direction detected
            # in the text that follows it, up to the matching PDI.
            bidi_char.bidi_class = (
                "LRI"
                if auto_detect_base_direction(
                    self.text[index + 1 :], stop_at_pdi=True, debug=self.debug
                )
                == TextDirection.LTR
                else "RLI"
            )
        if bidi_char.bidi_class in ("RLE", "LRE", "RLO", "LRO", "RLI", "LRI"):
            # X2 - X5: calculate explicit embeddings and explicit overrides
            # Class names are 3 letters: [0] is the direction (R/L),
            # [2] is the kind (E embedding, O override, I isolate).
            if bidi_char.bidi_class[0] == "R":
                new_embedding_level = (
                    current_status.embedding_level + 1
                ) | 1  # least greater odd
            else:
                new_embedding_level = (
                    current_status.embedding_level + 2
                ) & ~1  # least greater even
            if (
                bidi_char.bidi_class[2] == "I"
                and current_status.directional_override_status != "N"
            ):
                new_bidi_class = current_status.directional_override_status
            if (
                new_embedding_level <= MAX_DEPTH
                and overflow_isolate_count == 0
                and overflow_embedding_count == 0
            ):
                # Valid initiator: update the current state and push a copy of it
                current_status.embedding_level = new_embedding_level
                current_status.directional_override_status = (
                    bidi_char.bidi_class[0]
                    if bidi_char.bidi_class[2] == "O"
                    else "N"
                )
                if bidi_char.bidi_class[2] == "I":
                    valid_isolate_count += 1
                    current_status.directional_isolate_status = True
                else:
                    current_status.directional_isolate_status = False
                stack.append(replace(current_status))
            else:
                # Overflow: nothing is pushed, only the counters change
                if bidi_char.bidi_class[2] == "I":
                    overflow_isolate_count += 1
                else:
                    if overflow_isolate_count == 0:
                        overflow_embedding_count += 1
        if bidi_char.bidi_class not in (
            "B",
            "BN",
            "RLE",
            "LRE",
            "RLO",
            "LRO",
            "PDF",
            "FSI",
            "PDI",
        ):  # X6
            if current_status.directional_override_status != "N":
                new_bidi_class = current_status.directional_override_status
        if bidi_char.bidi_class == "PDI":  # X6a
            if overflow_isolate_count > 0:
                overflow_isolate_count -= 1
            elif valid_isolate_count > 0:
                overflow_embedding_count = 0
                # Discard entries until the top one was pushed by an isolate
                # initiator, then discard that entry as well.
                while True:
                    if not stack[-1].directional_isolate_status:
                        stack.pop()
                        continue
                    break
                stack.pop()
                current_status = replace(stack[-1])
                valid_isolate_count -= 1
            assert isinstance(current_status, DirectionalStatus)
            # Whatever branch ran above, the PDI takes the level (and override,
            # if any) of the state that is current at this point.
            bidi_char.embedding_level = current_status.embedding_level
            if current_status.directional_override_status != "N":
                new_bidi_class = current_status.directional_override_status
        if bidi_char.bidi_class == "PDF":  # X7
            if overflow_isolate_count == 0:
                if overflow_embedding_count > 0:
                    overflow_embedding_count -= 1
                else:
                    # Only pop an embedding/override entry, and never the
                    # initial entry at the bottom of the stack.
                    if (
                        not current_status.directional_isolate_status
                        and len(stack) > 1
                    ):
                        stack.pop()
                        current_status = replace(stack[-1])
        if new_bidi_class:
            bidi_char.bidi_class = new_bidi_class
        if bidi_char.bidi_class not in (
            "RLE",
            "LRE",
            "RLO",
            "LRO",
            "PDF",
            "BN",
        ):  # X9
            # Characters of the classes listed above are left out of the results
            if bidi_char.bidi_class == "B":
                bidi_char.embedding_level = self.base_embedding_level
            elif bidi_char.original_bidi_class not in ("LRI", "RLI", "FSI"):
                bidi_char.embedding_level = current_status.embedding_level
            results.append(bidi_char)
    if not results:
        self.characters = []
        return
    self.characters = results
    calculate_isolate_runs(results)
def get_bidi_fragments(self)
-
Expand source code Browse git
def get_bidi_fragments(self):
    """Return whatever split_bidi_fragments() produces for this paragraph."""
    fragments = self.split_bidi_fragments()
    return fragments
def get_characters(self) ‑> List[BidiCharacter]
-
Expand source code Browse git
def get_characters(self) -> List[BidiCharacter]:
    """Return the character list stored on the instance (the same object, not a copy)."""
    stored_characters = self.characters
    return stored_characters
def get_characters_with_embedding_level(self) ‑> List[BidiCharacter]
-
Expand source code Browse git
def get_characters_with_embedding_level(self) -> List[BidiCharacter]:
    """
    Return the stored characters after reorder_resolved_levels() has run.

    Only used on conformance testing.
    """
    # The call is made for its effect on the stored characters' embedding
    # levels; the reordered sequence it returns is deliberately discarded.
    _ = self.reorder_resolved_levels()
    return self.characters
def get_reordered_characters(self) ‑> List[BidiCharacter]
-
Expand source code Browse git
def get_reordered_characters(self) -> List[BidiCharacter]:
    """Return the sequence produced by reorder_resolved_levels(), unchanged."""
    visual_order = self.reorder_resolved_levels()
    return visual_order
def get_reordered_string(self)
-
Used for conformance validation
Expand source code Browse git
def get_reordered_string(self):
    """
    Concatenate the `character` of every item returned by
    reorder_resolved_levels() into one string.

    Used for conformance validation.
    """
    visual_order = self.reorder_resolved_levels()
    return "".join([bidi_char.character for bidi_char in visual_order])
def reorder_resolved_levels(self)
-
Expand source code Browse git
def reorder_resolved_levels(self):
    """
    Apply rules L1 and L2 and return the characters in display order.

    L1 is applied in place: the embedding_level of the affected entries of
    self.characters is overwritten with the base embedding level.
    L2 works on a copy, which is returned as a tuple.
    """
    paragraph_level = self.base_embedding_level

    # Rule L1. Reset the embedding level of segment separators, paragraph separators,
    # and any whitespace/isolate formatting run that precedes them or ends the line.
    # Scanning backwards, `resettable` stays True until a character of any
    # other class is met, and becomes True again at each separator.
    resettable = True
    for bidi_char in reversed(self.characters):
        original_class = bidi_char.original_bidi_class
        if original_class in ("S", "B"):
            bidi_char.embedding_level = paragraph_level
            resettable = True
        elif original_class in ("BN", "WS", "FSI", "LRI", "RLI", "PDI"):
            if resettable:
                bidi_char.embedding_level = paragraph_level
        else:
            resettable = False

    levels = [bidi_char.embedding_level for bidi_char in self.characters]
    highest_level = max([0, *levels])
    # 999 is the "no odd level present" sentinel: it leaves the range below empty
    lowest_odd_level = min([999, *(level for level in levels if level % 2 != 0)])

    # Rule L2. From the highest level found in the text to the lowest odd level on each line,
    # reverse any contiguous sequence of characters that are at that level or higher.
    visual_order = self.characters.copy()
    for threshold in range(highest_level, lowest_odd_level - 1, -1):
        run_start = None
        # One extra step past the end closes a run that reaches the last character
        for position in range(len(visual_order) + 1):
            inside_run = (
                position < len(visual_order)
                and visual_order[position].embedding_level >= threshold
            )
            if inside_run:
                if run_start is None:
                    run_start = position
            elif run_start is not None:
                visual_order[run_start:position] = visual_order[run_start:position][::-1]
                run_start = None
    return tuple(visual_order)
def split_bidi_fragments(self)
-
Expand source code Browse git
def split_bidi_fragments(self):
    """
    Group consecutive characters that share the same level direction.

    Returns a tuple of (text, TextDirection) pairs, in the order of
    self.characters; an empty tuple when there are no characters.
    """
    if not self.characters:
        return ()

    fragments = []
    pending = []  # characters of the fragment currently being collected
    run_direction = ""

    def close_fragment():
        # Emit the collected text (if any) with the direction it was collected under
        text = "".join(pending)
        if text:
            direction = (
                TextDirection.RTL if run_direction == "R" else TextDirection.LTR
            )
            fragments.append((text, direction))
        pending.clear()

    for bidi_char in self.characters:
        char_direction = bidi_char.get_direction_from_level()
        if char_direction != run_direction:
            close_fragment()
            run_direction = char_direction
        pending.append(bidi_char.character)
    close_fragment()
    return tuple(fragments)
class DirectionalStatus (embedding_level: int, directional_override_status: str, directional_isolate_status: bool)
-
DirectionalStatus(embedding_level: int, directional_override_status: str, directional_isolate_status: bool)
Expand source code Browse git
@dataclass
class DirectionalStatus:
    """
    State record kept on the stack that get_bidi_characters() maintains while
    it resolves explicit embeddings, overrides and isolates.
    """

    # Fixed attribute set, declared as slots
    __slots__ = [
        "embedding_level",
        "directional_override_status",
        "directional_isolate_status",
    ]
    embedding_level: int  # between 0 and MAX_DEPTH
    directional_override_status: str  # "N" (Neutral), "L" (Left) or "R" (Right)
    # True when the entry was pushed for an isolate initiator (class ending in "I"),
    # False for embeddings, overrides and the initial entry
    directional_isolate_status: bool
Instance variables
var directional_isolate_status : bool
-
Return an attribute of instance, which is of type owner.
var directional_override_status : str
-
Return an attribute of instance, which is of type owner.
var embedding_level : int
-
Return an attribute of instance, which is of type owner.
class IsolatingRun (characters: List[BidiCharacter], sos: str, eos: str)
-
Expand source code Browse git
class IsolatingRun:
    """
    Resolve weak types (W1-W7), neutral types (N0-N2) and implicit levels
    (I1-I2) for the characters of one isolating run sequence.

    All the work happens in the constructor and is applied in place to the
    characters that were passed in.

    characters: the run's characters, in logical order
    sos / eos: direction assumed before the first / after the last character
    """

    __slots__ = ["characters", "previous_direction", "next_direction"]

    def __init__(self, characters: "List[BidiCharacter]", sos: str, eos: str):
        self.characters = characters
        self.previous_direction = sos
        self.next_direction = eos
        self.resolve_weak_types()
        self.resolve_neutral_types()
        self.resolve_implicit_levels()

    def resolve_weak_types(self) -> None:
        """Apply rules W1 to W7 to self.characters, in place."""
        chars = self.characters
        count = len(chars)

        # W1. Examine each nonspacing mark (NSM) in the isolating run sequence, and change the type of the NSM to Other Neutral
        # if the previous character is an isolate initiator or PDI, and to the type of the previous character otherwise.
        # If the NSM is at the start of the isolating run sequence, it will get the type of sos.
        for i, bidi_char in enumerate(chars):
            if bidi_char.bidi_class == "NSM":
                if i == 0:
                    bidi_char.set_class(self.previous_direction)
                else:
                    bidi_char.set_class(
                        "ON"
                        if chars[i - 1].bidi_class in ("LRI", "RLI", "FSI", "PDI")
                        else chars[i - 1].bidi_class
                    )

        # W2. Search backward from each instance of a European number until the first strong type (R, L, AL, or sos) is found.
        # If an AL is found, change the type of the European number to Arabic number.
        # W3. Change all ALs to R.
        last_strong_type = self.previous_direction
        for bidi_char in chars:
            if bidi_char.bidi_class in ("R", "L", "AL"):
                last_strong_type = bidi_char.bidi_class
            if bidi_char.bidi_class == "AL":
                bidi_char.set_class("R")
            if bidi_char.bidi_class == "EN" and last_strong_type == "AL":
                bidi_char.set_class("AN")

        # W4. A single European separator between two European numbers changes to a European number.
        # A single common separator between two numbers of the same type changes to that type.
        # The first and last characters have only one neighbour and are skipped.
        for i in range(1, count - 1):
            bidi_char = chars[i]
            if (
                bidi_char.bidi_class == "ES"
                and chars[i - 1].bidi_class == "EN"
                and chars[i + 1].bidi_class == "EN"
            ):
                bidi_char.set_class("EN")
            if (
                bidi_char.bidi_class == "CS"
                and chars[i - 1].bidi_class in ("AN", "EN")
                and chars[i + 1].bidi_class == chars[i - 1].bidi_class
            ):
                bidi_char.set_class(chars[i - 1].bidi_class)

        # W5. A sequence of European terminators adjacent to European numbers changes to all European numbers.
        # W6. All remaining separators and terminators (after the application of W4 and W5) change to Other Neutral.
        # Each maximal run of ET is handled as a whole with plain loops: a
        # per-character recursive look-around would hit the interpreter's
        # recursion limit on long runs of terminators.
        # NOTE(review): assumes set_class() updates bidi_class - confirm on BidiCharacter.
        i = 0
        while i < count:
            bidi_class = chars[i].bidi_class
            if bidi_class == "ET":
                end = i + 1
                while end < count and chars[end].bidi_class == "ET":
                    end += 1
                touches_en = (i > 0 and chars[i - 1].bidi_class == "EN") or (
                    end < count and chars[end].bidi_class == "EN"
                )
                resolved_class = "EN" if touches_en else "ON"
                for run_char in chars[i:end]:
                    run_char.set_class(resolved_class)
                i = end
                continue
            if bidi_class in ("ES", "CS"):
                chars[i].set_class("ON")
            i += 1

        # W7. Search backward from each instance of a European number until the first strong type (R, L, or sos) is found.
        # If an L is found, then change the type of the European number to L.
        last_strong_type = self.previous_direction
        for bidi_char in chars:
            if bidi_char.bidi_class in ("R", "L", "AL"):
                last_strong_type = bidi_char.bidi_class
            if bidi_char.bidi_class == "EN" and last_strong_type == "L":
                bidi_char.set_class("L")

    def pair_brackets(self) -> List[Tuple[int, int]]:
        """
        Calculate all the bracket pairs on an isolate run, to be used on rule N0

        How to calculate bracket pairs:
        - Basic definitions 14, 15 and 16: http://www.unicode.org/reports/tr9/#BD14
        - BIDI brackets for dummies: https://www.unicode.org/notes/tn39/

        Returns (opening index, closing index) tuples sorted by opening index,
        or an empty list as soon as more than 63 brackets would be open at once.
        """
        # U+232A and U+3009 are canonically equivalent closing angle brackets:
        # either one closes an opener whose pair is the other. Written as
        # escapes because the two glyphs look identical and a normalising
        # tool would silently collapse literal characters into one.
        angle_closers = ("\u232a", "\u3009")
        max_open_brackets = 63
        open_brackets = []  # stack of (opening character, index)
        bracket_pairs = []
        for index, char in enumerate(self.characters):
            # Only brackets whose class is still ON take part in pairing
            if char.bidi_class != "ON" or char.character not in BIDI_BRACKETS:
                continue
            bracket_type = BIDI_BRACKETS[char.character]["type"]
            if bracket_type == "o":
                if len(open_brackets) >= max_open_brackets:
                    return []
                open_brackets.append((char.character, index))
            elif bracket_type == "c":
                # Look for the innermost opener this character can close
                for depth in range(len(open_brackets) - 1, -1, -1):
                    open_char, open_index = open_brackets[depth]
                    expected_closer = BIDI_BRACKETS[open_char]["pair"]
                    if expected_closer == char.character or (
                        expected_closer in angle_closers
                        and char.character in angle_closers
                    ):
                        bracket_pairs.append((open_index, index))
                        # Drop the matched opener and everything opened after it
                        del open_brackets[depth:]
                        break
        return sorted(bracket_pairs, key=itemgetter(0))

    def resolve_neutral_types(self) -> None:
        """Apply rules N0 to N2 to self.characters, in place."""
        chars = self.characters

        def strong_direction(bidi_class):
            # L counts as "L"; R, AN and EN count as "R"; anything else is not strong
            if bidi_class == "L":
                return "L"
            if bidi_class in ("R", "AN", "EN"):
                return "R"
            return None

        def previous_strong(index: int):
            # Nearest strong direction before `index`, or sos when there is none.
            # Iterative on purpose: recursing once per character overflows the
            # interpreter stack on long runs of neutrals.
            for position in range(index - 1, -1, -1):
                direction = strong_direction(chars[position].bidi_class)
                if direction:
                    return direction
            return self.previous_direction

        # N0-N2: Resolving neutral types
        # N0
        brackets = self.pair_brackets()
        if brackets:
            embedding_direction = chars[0].get_direction_from_level()
            for opener, closer in brackets:
                strong_opposite_direction = False
                resulting_direction = None
                for index in range(opener, closer):
                    direction = strong_direction(chars[index].bidi_class)
                    if direction is None:
                        continue
                    if direction == embedding_direction:
                        # A strong type matching the embedding direction wins outright
                        resulting_direction = embedding_direction
                        break
                    strong_opposite_direction = True
                if resulting_direction is None and strong_opposite_direction:
                    opposite_direction = "L" if embedding_direction == "R" else "R"
                    if previous_strong(opener) == opposite_direction:
                        resulting_direction = opposite_direction
                    else:
                        resulting_direction = embedding_direction
                if resulting_direction:
                    chars[opener].bidi_class = resulting_direction
                    chars[closer].bidi_class = resulting_direction
                    if len(chars) > closer + 1:
                        next_char = chars[closer + 1]
                        if (
                            next_char.original_bidi_class == "NSM"
                            and next_char.bidi_class == "ON"
                        ):
                            next_char.bidi_class = resulting_direction

        # N1-N2
        # Resolving position i only changes chars[i], so the strong direction
        # that follows each position can be computed once, right to left.
        following_strong = [None] * len(chars)
        upcoming = self.next_direction
        for position in range(len(chars) - 1, -1, -1):
            following_strong[position] = upcoming
            direction = strong_direction(chars[position].bidi_class)
            if direction:
                upcoming = direction

        # The preceding strong direction is tracked while walking left to
        # right, so neutrals resolved earlier in this pass are taken into account.
        preceding_strong = self.previous_direction
        for position, bidi_char in enumerate(chars):
            if bidi_char.bidi_class in (
                "B",
                "S",
                "WS",
                "ON",
                "FSI",
                "LRI",
                "RLI",
                "PDI",
            ):
                if preceding_strong == following_strong[position]:
                    bidi_char.bidi_class = preceding_strong
                else:
                    bidi_char.bidi_class = bidi_char.get_direction_from_level()
            direction = strong_direction(bidi_char.bidi_class)
            if direction:
                preceding_strong = direction

    def resolve_implicit_levels(self) -> None:
        """Apply rules I1 and I2: raise embedding levels according to the resolved class."""
        # I1. For all characters with an even (left-to-right) embedding level,
        # those of type R go up one level and those of type AN or EN go up two levels.
        even_level_increase = {"R": 1, "AN": 2, "EN": 2}
        # I2. For all characters with an odd (right-to-left) embedding level, those of type L, EN or AN go up one level.
        odd_level_increase = {"L": 1, "EN": 1, "AN": 1}
        for bidi_char in self.characters:
            if bidi_char.embedding_level % 2 == 0:
                increase = even_level_increase.get(bidi_char.bidi_class, 0)
            else:
                increase = odd_level_increase.get(bidi_char.bidi_class, 0)
            if increase:
                bidi_char.embedding_level += increase
Instance variables
var characters
-
Return an attribute of instance, which is of type owner.
var next_direction
-
Return an attribute of instance, which is of type owner.
var previous_direction
-
Return an attribute of instance, which is of type owner.
Methods
def pair_brackets(self) ‑> List[Tuple[int, int]]
-
Calculate all the bracket pairs on an isolate run, to be used on rule N0 How to calculate bracket pairs: - Basic definitions 14, 15 and 16: http://www.unicode.org/reports/tr9/#BD14 - BIDI brackets for dummies: https://www.unicode.org/notes/tn39/
Expand source code Browse git
def pair_brackets(self) -> List[Tuple[int, int]]:
    """
    Calculate all the bracket pairs on an isolate run, to be used on rule N0

    How to calculate bracket pairs:
    - Basic definitions 14, 15 and 16: http://www.unicode.org/reports/tr9/#BD14
    - BIDI brackets for dummies: https://www.unicode.org/notes/tn39/

    Returns (opening index, closing index) tuples sorted by opening index,
    or an empty list as soon as more than 63 brackets would be open at once.
    """
    # U+232A and U+3009 are canonically equivalent closing angle brackets:
    # either one closes an opener whose pair is the other. Written as
    # escapes because the two glyphs look identical and a normalising
    # tool would silently collapse literal characters into one.
    angle_closers = ("\u232a", "\u3009")
    max_open_brackets = 63
    open_brackets = []  # stack of (opening character, index)
    bracket_pairs = []
    for index, char in enumerate(self.characters):
        # Only brackets whose class is still ON take part in pairing
        if char.bidi_class != "ON" or char.character not in BIDI_BRACKETS:
            continue
        bracket_type = BIDI_BRACKETS[char.character]["type"]
        if bracket_type == "o":
            if len(open_brackets) >= max_open_brackets:
                return []
            open_brackets.append((char.character, index))
        elif bracket_type == "c":
            # Look for the innermost opener this character can close
            for depth in range(len(open_brackets) - 1, -1, -1):
                open_char, open_index = open_brackets[depth]
                expected_closer = BIDI_BRACKETS[open_char]["pair"]
                if expected_closer == char.character or (
                    expected_closer in angle_closers
                    and char.character in angle_closers
                ):
                    bracket_pairs.append((open_index, index))
                    # Drop the matched opener and everything opened after it
                    del open_brackets[depth:]
                    break
    return sorted(bracket_pairs, key=itemgetter(0))
def resolve_implicit_levels(self) ‑> None
-
Expand source code Browse git
def resolve_implicit_levels(self) -> None:
    """Apply rules I1 and I2: raise embedding levels according to the resolved class."""
    # I1. For all characters with an even (left-to-right) embedding level,
    # those of type R go up one level and those of type AN or EN go up two levels.
    even_level_increase = {"R": 1, "AN": 2, "EN": 2}
    # I2. For all characters with an odd (right-to-left) embedding level, those of type L, EN or AN go up one level.
    odd_level_increase = {"L": 1, "EN": 1, "AN": 1}
    for bidi_char in self.characters:
        if bidi_char.embedding_level % 2 == 0:
            increase = even_level_increase.get(bidi_char.bidi_class, 0)
        else:
            increase = odd_level_increase.get(bidi_char.bidi_class, 0)
        if increase:
            bidi_char.embedding_level += increase
def resolve_neutral_types(self) ‑> None
-
Expand source code Browse git
def resolve_neutral_types(self) -> None:
    """
    Apply rules N0 to N2 to self.characters, in place.

    Bracket pairs come from self.pair_brackets(); sos and eos are read from
    self.previous_direction and self.next_direction.
    """
    chars = self.characters

    def strong_direction(bidi_class):
        # L counts as "L"; R, AN and EN count as "R"; anything else is not strong
        if bidi_class == "L":
            return "L"
        if bidi_class in ("R", "AN", "EN"):
            return "R"
        return None

    def previous_strong(index: int):
        # Nearest strong direction before `index`, or sos when there is none.
        # Iterative on purpose: recursing once per character overflows the
        # interpreter stack on long runs of neutrals.
        for position in range(index - 1, -1, -1):
            direction = strong_direction(chars[position].bidi_class)
            if direction:
                return direction
        return self.previous_direction

    # N0-N2: Resolving neutral types
    # N0
    brackets = self.pair_brackets()
    if brackets:
        embedding_direction = chars[0].get_direction_from_level()
        for opener, closer in brackets:
            strong_opposite_direction = False
            resulting_direction = None
            for index in range(opener, closer):
                direction = strong_direction(chars[index].bidi_class)
                if direction is None:
                    continue
                if direction == embedding_direction:
                    # A strong type matching the embedding direction wins outright
                    resulting_direction = embedding_direction
                    break
                strong_opposite_direction = True
            if resulting_direction is None and strong_opposite_direction:
                opposite_direction = "L" if embedding_direction == "R" else "R"
                if previous_strong(opener) == opposite_direction:
                    resulting_direction = opposite_direction
                else:
                    resulting_direction = embedding_direction
            if resulting_direction:
                chars[opener].bidi_class = resulting_direction
                chars[closer].bidi_class = resulting_direction
                if len(chars) > closer + 1:
                    next_char = chars[closer + 1]
                    if (
                        next_char.original_bidi_class == "NSM"
                        and next_char.bidi_class == "ON"
                    ):
                        next_char.bidi_class = resulting_direction

    # N1-N2
    # Resolving position i only changes chars[i], so the strong direction
    # that follows each position can be computed once, right to left.
    following_strong = [None] * len(chars)
    upcoming = self.next_direction
    for position in range(len(chars) - 1, -1, -1):
        following_strong[position] = upcoming
        direction = strong_direction(chars[position].bidi_class)
        if direction:
            upcoming = direction

    # The preceding strong direction is tracked while walking left to
    # right, so neutrals resolved earlier in this pass are taken into account.
    preceding_strong = self.previous_direction
    for position, bidi_char in enumerate(chars):
        if bidi_char.bidi_class in (
            "B",
            "S",
            "WS",
            "ON",
            "FSI",
            "LRI",
            "RLI",
            "PDI",
        ):
            if preceding_strong == following_strong[position]:
                bidi_char.bidi_class = preceding_strong
            else:
                bidi_char.bidi_class = bidi_char.get_direction_from_level()
        direction = strong_direction(bidi_char.bidi_class)
        if direction:
            preceding_strong = direction
def resolve_weak_types(self) ‑> None
-
Expand source code Browse git
def resolve_weak_types(self) -> None:
    """
    Apply rules W1 to W7 to self.characters, in place.

    sos is read from self.previous_direction.
    """
    chars = self.characters
    count = len(chars)

    # W1. Examine each nonspacing mark (NSM) in the isolating run sequence, and change the type of the NSM to Other Neutral
    # if the previous character is an isolate initiator or PDI, and to the type of the previous character otherwise.
    # If the NSM is at the start of the isolating run sequence, it will get the type of sos.
    for i, bidi_char in enumerate(chars):
        if bidi_char.bidi_class == "NSM":
            if i == 0:
                bidi_char.set_class(self.previous_direction)
            else:
                bidi_char.set_class(
                    "ON"
                    if chars[i - 1].bidi_class in ("LRI", "RLI", "FSI", "PDI")
                    else chars[i - 1].bidi_class
                )

    # W2. Search backward from each instance of a European number until the first strong type (R, L, AL, or sos) is found.
    # If an AL is found, change the type of the European number to Arabic number.
    # W3. Change all ALs to R.
    last_strong_type = self.previous_direction
    for bidi_char in chars:
        if bidi_char.bidi_class in ("R", "L", "AL"):
            last_strong_type = bidi_char.bidi_class
        if bidi_char.bidi_class == "AL":
            bidi_char.set_class("R")
        if bidi_char.bidi_class == "EN" and last_strong_type == "AL":
            bidi_char.set_class("AN")

    # W4. A single European separator between two European numbers changes to a European number.
    # A single common separator between two numbers of the same type changes to that type.
    # The first and last characters have only one neighbour and are skipped.
    for i in range(1, count - 1):
        bidi_char = chars[i]
        if (
            bidi_char.bidi_class == "ES"
            and chars[i - 1].bidi_class == "EN"
            and chars[i + 1].bidi_class == "EN"
        ):
            bidi_char.set_class("EN")
        if (
            bidi_char.bidi_class == "CS"
            and chars[i - 1].bidi_class in ("AN", "EN")
            and chars[i + 1].bidi_class == chars[i - 1].bidi_class
        ):
            bidi_char.set_class(chars[i - 1].bidi_class)

    # W5. A sequence of European terminators adjacent to European numbers changes to all European numbers.
    # W6. All remaining separators and terminators (after the application of W4 and W5) change to Other Neutral.
    # Each maximal run of ET is handled as a whole with plain loops: a
    # per-character recursive look-around would hit the interpreter's
    # recursion limit on long runs of terminators.
    # NOTE(review): assumes set_class() updates bidi_class - confirm on BidiCharacter.
    i = 0
    while i < count:
        bidi_class = chars[i].bidi_class
        if bidi_class == "ET":
            end = i + 1
            while end < count and chars[end].bidi_class == "ET":
                end += 1
            touches_en = (i > 0 and chars[i - 1].bidi_class == "EN") or (
                end < count and chars[end].bidi_class == "EN"
            )
            resolved_class = "EN" if touches_en else "ON"
            for run_char in chars[i:end]:
                run_char.set_class(resolved_class)
            i = end
            continue
        if bidi_class in ("ES", "CS"):
            chars[i].set_class("ON")
        i += 1

    # W7. Search backward from each instance of a European number until the first strong type (R, L, or sos) is found.
    # If an L is found, then change the type of the European number to L.
    last_strong_type = self.previous_direction
    for bidi_char in chars:
        if bidi_char.bidi_class in ("R", "L", "AL"):
            last_strong_type = bidi_char.bidi_class
        if bidi_char.bidi_class == "EN" and last_strong_type == "L":
            bidi_char.set_class("L")