661f8c0b920f5da Mon Sep 17 00:00:00 2001
From: Kristian Rietveld
Date: Tue, 19 Mar 2013 11:23:49 +0100
Subject: [PATCH 2/2] Detect and handle characters encoded in two UTF16 code points

Another important change: gi->index should point at the current
character, not the current index into the string. Before this change,
the current character equaled the current index into the string.
---
 modules/basic/basic-coretext.c |   55 ++++++++++++++++++++++++++++-----------
 1 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/modules/basic/basic-coretext.c b/modules/basic/basic-coretext.c
index 33ce479..06b648e 100644
--- a/modules/basic/basic-coretext.c
+++ b/modules/basic/basic-coretext.c
@@ -166,7 +166,42 @@ run_iterator_run_is_non_monotonic (struct RunIterator *iter)
 static gunichar
 run_iterator_get_character (struct RunIterator *iter)
 {
-  return CFStringGetCharacterAtIndex (iter->cstr, iter->current_indices[iter->ct_i]);
+  CFIndex lower, upper;
+
+  lower = iter->current_indices[iter->ct_i];
+  if (iter->ct_i + 1 < CTRunGetGlyphCount (iter->current_run))
+    upper = iter->current_indices[iter->ct_i + 1];
+  else
+    {
+      CFRange range = CTRunGetStringRange (iter->current_run);
+      upper = range.location + range.length;
+    }
+
+  if (upper - lower == 1)
+    return CFStringGetCharacterAtIndex (iter->cstr, lower);
+  if (upper - lower == 2)
+    {
+      /* Character is encoded in two UTF16 code points. */
+      gunichar *ch;
+      gunichar retval;
+      gunichar2 orig[2];
+
+      orig[0] = CFStringGetCharacterAtIndex (iter->cstr, lower);
+      orig[1] = CFStringGetCharacterAtIndex (iter->cstr, lower + 1);
+
+      ch = g_utf16_to_ucs4 (orig, 2, NULL, NULL, NULL);
+      retval = ch != NULL ? *ch : orig[0]; /* NULL means invalid UTF16 */
+      g_free (ch);
+
+      return retval;
+    }
+
+  /* Other distances (e.g. two glyphs sharing a character index, or a
+   * non-monotonic run) are not decoded. Instead of crashing, return the
+   * first character which will likely be displayed as unknown glyph.
+   */
+
+  return CFStringGetCharacterAtIndex (iter->cstr, lower);
 }
 
 static CGGlyph
@@ -175,12 +210,6 @@ run_iterator_get_cgglyph (struct RunIterator *iter)
   return iter->current_cgglyphs[iter->ct_i];
 }
 
-static CFIndex
-run_iterator_get_index (struct RunIterator *iter)
-{
-  return iter->current_indices[iter->ct_i];
-}
-
 static gboolean
 run_iterator_create (struct RunIterator *iter,
                      const char *text,
@@ -336,7 +365,7 @@ create_core_text_glyph_list (const char *text,
       struct GlyphInfo *gi;
 
       gi = g_slice_new (struct GlyphInfo);
-      gi->index = run_iterator_get_index (&riter);
+      gi->index = riter.total_ct_i;
       gi->cgglyph = run_iterator_get_cgglyph (&riter);
       gi->wc = run_iterator_get_character (&riter);
 
@@ -376,9 +405,8 @@ basic_engine_shape (PangoEngineShape *engine,
    * glyph sequence generated by the CoreText typesetter:
    *  # E.g. zero-width spaces do not end up in the CoreText glyph sequence. We have
    *    to manually account for the gap in the character indices.
-   *  # Sometimes, CoreText generates two glyph for the same character index. We
-   *    currently handle this "properly" as in we do not crash or corrupt memory,
-   *    but that's about it.
+   *  # Sometimes, CoreText generates two glyphs for the same character index. These
+   *    are properly composed into a single 32-bit gunichar.
    *  # Due to mismatches in size, the CoreText glyph sequence can either be longer or
    *    shorter than the PangoGlyphString. Note that the size of the PangoGlyphString
    *    should match the number of characters in "text".
@@ -390,11 +418,6 @@ basic_engine_shape (PangoEngineShape *engine,
    * increasing/decreasing.
    *
    * FIXME items for future fixing:
-   *  # CoreText strings are UTF16, and the indices *often* refer to characters,
-   *    but not *always*. Notable exception is when a character is encoded using
-   *    two UTF16 code points. This are two characters in a CFString. At this point
-   *    advancing a single character in the CFString and advancing a single character
-   *    using g_utf8_next_char in the const char string goes out of sync.
    *  # We currently don't bother about LTR, Pango core appears to fix this up for us.
    *    (Even when we cared warnings were generated that strings were in the wrong
    *    order, this should be investigated).
-- 
1.7.4.4