661f8c0b920f5da Mon Sep 17 00:00:00 2001
From: Kristian Rietveld <kris@lanedo.com>
Date: Tue, 19 Mar 2013 11:23:49 +0100
Subject: [PATCH 2/2] Detect and handle characters encoded in two UTF16 code
 points

Another important change: gi->index should point at the current
character, not at the current index into the string. Before this
change, the character index was taken to be the index into the string.
---
 modules/basic/basic-coretext.c |   55 ++++++++++++++++++++++++++++-----------
 1 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/modules/basic/basic-coretext.c b/modules/basic/basic-coretext.c
index 33ce479..06b648e 100644
--- a/modules/basic/basic-coretext.c
+++ b/modules/basic/basic-coretext.c
@@ -166,7 +166,42 @@ run_iterator_run_is_non_monotonic (struct RunIterator *iter)
 static gunichar
 run_iterator_get_character (struct RunIterator *iter)
 {
-  return CFStringGetCharacterAtIndex (iter->cstr, iter->current_indices[iter->ct_i]);
+  CFIndex lower, upper;
+  /* Range from this glyph's string index to the next glyph's (or the run's end). */
+  lower = iter->current_indices[iter->ct_i];
+  if (iter->ct_i + 1 < CTRunGetGlyphCount (iter->current_run))
+    upper = iter->current_indices[iter->ct_i + 1];
+  else
+    {
+      CFRange range = CTRunGetStringRange (iter->current_run);
+      upper = range.location + range.length;
+    }
+
+  if (upper - lower == 1)
+    return CFStringGetCharacterAtIndex (iter->cstr, lower);
+  if (upper - lower == 2)
+    {
+      /* Character is encoded in two UTF16 code points. */
+      gunichar *ch;
+      gunichar retval;
+      gunichar2 orig[2];
+
+      orig[0] = CFStringGetCharacterAtIndex (iter->cstr, lower);
+      orig[1] = CFStringGetCharacterAtIndex (iter->cstr, lower + 1);
+      /* g_utf16_to_ucs4 returns NULL on invalid input; fall back to orig[0]. */
+      ch = g_utf16_to_ucs4 (orig, 2, NULL, NULL, NULL);
+      retval = ch != NULL ? *ch : orig[0];
+      g_free (ch);
+
+      return retval;
+    }
+
+  /* This should not be reached, because other cases cannot occur. Instead
+   * of crashing, return the first character which will likely be displayed
+   * as an unknown glyph.
+   */
+
+  return CFStringGetCharacterAtIndex (iter->cstr, lower);
 }

 static CGGlyph
@@ -175,12 +210,6 @@ run_iterator_get_cgglyph (struct RunIterator *iter)
   return iter->current_cgglyphs[iter->ct_i];
 }

-static CFIndex
-run_iterator_get_index (struct RunIterator *iter)
-{
-  return iter->current_indices[iter->ct_i];
-}
-
 static gboolean
 run_iterator_create (struct RunIterator *iter,
                      const char         *text,
@@ -336,7 +365,7 @@ create_core_text_glyph_list (const char *text,
       struct GlyphInfo *gi;

       gi = g_slice_new (struct GlyphInfo);
-      gi->index = run_iterator_get_index (&riter);
+      gi->index = riter.total_ct_i;
       gi->cgglyph = run_iterator_get_cgglyph (&riter);
       gi->wc = run_iterator_get_character (&riter);

@@ -376,9 +405,8 @@ basic_engine_shape (PangoEngineShape    *engine,
    * glyph sequence generated by the CoreText typesetter:
    *   # E.g. zero-width spaces do not end up in the CoreText glyph sequence. We have
    *     to manually account for the gap in the character indices.
-   *   # Sometimes, CoreText generates two glyph for the same character index. We
-   *     currently handle this "properly" as in we do not crash or corrupt memory,
-   *     but that's about it.
+   *   # Sometimes, CoreText generates two glyphs for the same character index. These
+   *     are properly composed into a single 32-bit gunichar.
    *   # Due to mismatches in size, the CoreText glyph sequence can either be longer or
    *     shorter than the PangoGlyphString. Note that the size of the PangoGlyphString
    *     should match the number of characters in "text".
@@ -390,11 +418,6 @@ basic_engine_shape (PangoEngineShape    *engine,
    * increasing/decreasing.
    *
    * FIXME items for future fixing:
-   *   # CoreText strings are UTF16, and the indices *often* refer to characters,
-   *     but not *always*. Notable exception is when a character is encoded using
-   *     two UTF16 code points. This are two characters in a CFString. At this point
-   *     advancing a single character in the CFString and advancing a single character
-   *     using g_utf8_next_char in the const char string goes out of sync.
    *   # We currently don't bother about LTR, Pango core appears to fix this up for us.
    *     (Even when we cared warnings were generated that strings were in the wrong
    *     order, this should be investigated).
--
1.7.4.4