/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
- Copyright (C) 2000-2001, 2004-2005, 2009-2015 Free Software
+ Copyright (C) 2000-2001, 2004-2005, 2009-2016 Free Software
Foundation, Inc.
This file is part of GNU Emacs.
GNU Emacs is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
GNU Emacs is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
/* Copy everything from the start through the active part of
the level stack. */
memcpy (to, from,
- (offsetof (struct bidi_it, level_stack[1])
+ (offsetof (struct bidi_it, level_stack) + sizeof from->level_stack[0]
+ from->stack_idx * sizeof from->level_stack[0]));
}
emacs_abort ();
staticpro (&bidi_brackets_table);
- DEFSYM (Qparagraph_start, "paragraph-start");
- paragraph_start_re = Fsymbol_value (Qparagraph_start);
- if (!STRINGP (paragraph_start_re))
- paragraph_start_re = build_string ("\f\\|[ \t]*$");
+ paragraph_start_re = build_string ("^\\(\f\\|[ \t]*\\)$");
staticpro (¶graph_start_re);
- DEFSYM (Qparagraph_separate, "paragraph-separate");
- paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
- if (!STRINGP (paragraph_separate_re))
- paragraph_separate_re = build_string ("[ \t\f]*$");
+ paragraph_separate_re = build_string ("^[ \t\f]*$");
staticpro (¶graph_separate_re);
bidi_cache_sp = 0;
/* `(space ...)' display specs are handled as paragraph
separators for the purposes of the reordering; see UAX#9
section 3 and clause HL1 in section 4.3 there. */
- ch = 0x2029;
+ ch = PARAGRAPH_SEPARATOR;
}
else
{
/* All other display specs are handled as the Unicode Object
Replacement Character. */
- ch = 0xFFFC;
+ ch = OBJECT_REPLACEMENT_CHARACTER;
}
disp_end_pos = compute_display_string_end (*disp_pos, string);
if (disp_end_pos < 0)
{
eassert (bidi_it->prev.charpos == bidi_it->charpos - 1);
prev_type = bidi_it->prev.orig_type;
- if (prev_type == FSI)
- prev_type = bidi_it->type_after_wn;
}
}
/* Don't move at end of buffer/string. */
emacs_abort ();
bidi_it->bytepos += bidi_it->ch_len;
prev_type = bidi_it->orig_type;
- if (prev_type == FSI)
- prev_type = bidi_it->type_after_wn;
}
else /* EOB or end of string */
prev_type = NEUTRAL_B;
if (typ1 != STRONG_R && typ1 != STRONG_AL)
{
type = LRI;
+ /* Override orig_type, which will be needed when we come to
+ examine the next character, which is the first character
+ inside the isolate. */
+ bidi_it->orig_type = type;
goto fsi_as_lri;
}
else
- type = RLI;
+ {
+ type = RLI;
+ bidi_it->orig_type = type;
+ }
/* FALLTHROUGH */
case RLI: /* X5a */
if (override == NEUTRAL_DIR)
if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
type = WEAK_EN;
}
- else if (bidi_it->next_en_pos >=0)
+ else if (type == WEAK_BN
+ /* This condition is for the following important case:
+
+ . we are at level zero
+ . either previous strong character was L,
+ or we've seen no strong characters since sos
+ and the base paragraph direction is L2R
+ . this BN is NOT a bidi directional control
+
+ For such a situation, either this BN will be
+ converted to EN per W5, and then to L by virtue
+ of W7; or it will become ON per W6, and then L
+ because of N1/N2. So we take a shortcut here
+ and make it L right away, to avoid the
+ potentially costly loop below. This is
+ important when the buffer has a long series of
+ control characters, like binary nulls, and no
+ R2L characters at all. */
+ && new_level == 0
+ && !bidi_explicit_dir_char (bidi_it->ch)
+ && ((bidi_it->last_strong.type == STRONG_L)
+ || (bidi_it->last_strong.type == UNKNOWN_BT
+ && bidi_it->sos == L2R)))
+ type = STRONG_L;
+ else if (bidi_it->next_en_pos >= 0)
{
/* We overstepped the last known position for ET
resolution but there could be other such characters
#define MAX_BPA_STACK ((int)max (MAX_ALLOCA / sizeof (bpa_stack_entry), 1))
/* UAX#9 says to match opening brackets with the matching closing
- brackets or their canonical equivalents. As of Unicode 7.0, there
+ brackets or their canonical equivalents. As of Unicode 8.0, there
are only 2 bracket characters that have canonical equivalence
decompositions: u+2329 and u+232A. So instead of accessing the
table in uni-decomposition.el, we just handle these 2 characters
And finally, cross-reference these two:
- fgrep -w -f brackets.txt decompositions.txt
+ grep -Fw -f brackets.txt decompositions.txt
where "decompositions.txt" was produced by the 1st script, and
- "brackets.txt" by the 2nd script. In the output of fgrep, look
+ "brackets.txt" by the 2nd script. In the output of grep, look
only for decompositions that don't begin with some compatibility
formatting tag, such as "<compat>". Only decompositions that
consist solely of character codepoints are relevant to bidi
#define CANONICAL_EQU(c) \
( ASCII_CHAR_P (c) ? c \
- : (c) == 0x2329 ? 0x3008 \
- : (c) == 0x232a ? 0x3009 \
+ : (c) == LEFT_POINTING_ANGLE_BRACKET ? LEFT_ANGLE_BRACKET \
+ : (c) == RIGHT_POINTING_ANGLE_BRACKET ? RIGHT_ANGLE_BRACKET \
: c )
#ifdef ENABLE_CHECKING
we are already at paragraph end. */
&& (is_neutral || bidi_isolate_fmt_char (type)))
/* N1-N2/Retaining */
- || (type == WEAK_BN && bidi_explicit_dir_char (bidi_it->ch)))
+ || type == WEAK_BN)
{
if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
{
entering the expensive loop in the "else" clause. */
else if (current_level == 0
&& bidi_it->prev_for_neutral.type == STRONG_L
- && !bidi_explicit_dir_char (bidi_it->ch)
- && !bidi_isolate_fmt_char (type))
+ && (ASCII_CHAR_P (bidi_it->ch)
+ || (type != WEAK_BN
+ && !bidi_explicit_dir_char (bidi_it->ch)
+ && !bidi_isolate_fmt_char (type))))
type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
STRONG_L, current_level);
else if (/* current level is 1 */
&& (bidi_it->prev_for_neutral.type == STRONG_R
|| bidi_it->prev_for_neutral.type == WEAK_EN
|| bidi_it->prev_for_neutral.type == WEAK_AN)
+ && type != WEAK_BN
&& !bidi_explicit_dir_char (bidi_it->ch)
&& !bidi_isolate_fmt_char (type))
type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
{
/* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
the assumption of batch-style processing; see clauses W4,
- W5, and especially N1, which require to look far forward
+ W5, and especially N1, which require looking far forward
(as well as back) in the buffer/string. May the fleas of
a thousand camels infest the armpits of those who design
supposedly general-purpose algorithms by looking at their
}
}
- /* Perhaps the character we want is already cached s fully resolved.
+ /* Perhaps the character we want is already cached as fully resolved.
If it is, the call to bidi_cache_find below will return a type
other than UNKNOWN_BT. */
if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
it belongs to a sequence of WS characters preceding a newline
or a TAB or a paragraph separator. */
if ((bidi_it->orig_type == NEUTRAL_WS
+ || bidi_it->orig_type == WEAK_BN
|| bidi_isolate_fmt_char (bidi_it->orig_type))
- && bidi_it->next_for_ws.charpos < bidi_it->charpos)
+ && bidi_it->next_for_ws.charpos < bidi_it->charpos
+ /* If this character is already at base level, we don't need to
+ reset it, so avoid the potentially costly loop below. */
+ && level != bidi_it->level_stack[0].level)
{
int ch;
ptrdiff_t clen = bidi_it->ch_len;
/* Resolve implicit levels. */
if (bidi_it->orig_type == NEUTRAL_B /* L1 */
- || bidi_it->orig_type == NEUTRAL_S
- || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
- || (bidi_it->orig_type == NEUTRAL_WS
- && (bidi_it->next_for_ws.type == NEUTRAL_B
- || bidi_it->next_for_ws.type == NEUTRAL_S)))
+ || bidi_it->orig_type == NEUTRAL_S
+ || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
+ || ((bidi_it->orig_type == NEUTRAL_WS
+ || bidi_it->orig_type == WEAK_BN
+ || bidi_isolate_fmt_char (bidi_it->orig_type)
+ || bidi_explicit_dir_char (bidi_it->ch))
+ && (bidi_it->next_for_ws.type == NEUTRAL_B
+ || bidi_it->next_for_ws.type == NEUTRAL_S)))
level = bidi_it->level_stack[0].level;
else if ((level & 1) == 0) /* I1 */
{
{
int old_level, new_level, next_level;
struct bidi_it sentinel;
- struct gcpro gcpro1;
if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
emacs_abort ();
bidi_it->scan_dir = 1; /* default to logical order */
}
- /* The code below can call eval, and thus cause GC. If we are
- iterating a Lisp string, make sure it won't be GCed. */
- if (STRINGP (bidi_it->string.lstring))
- GCPRO1 (bidi_it->string.lstring);
-
/* If we just passed a newline, initialize for the next line. */
if (!bidi_it->first_elt
&& (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
eassert (bidi_it->resolved_level >= 0
&& bidi_it->resolved_level <= BIDI_MAXDEPTH + 2);
-
- if (STRINGP (bidi_it->string.lstring))
- UNGCPRO;
}
/* Utility function for looking for strong directional characters