about summary refs log tree commit diff
path: root/src/libunicode/decompose.rs
blob: 832b65d473996d4e642e23d996b57d630eb50816 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

/*!
  Functions for computing canonical and compatibility decompositions
  of Unicode characters.
  */

use core::option::{Option, Some, None};
use core::slice::ImmutableVector;
use tables::normalization::{canonical_table, compatibility_table};

fn bsearch_table(c: char, r: &'static [(char, &'static [char])]) -> Option<&'static [char]> {
    use core::cmp::{Equal, Less, Greater};
    match r.bsearch(|&(val, _)| {
        if c == val { Equal }
        else if val < c { Less }
        else { Greater }
    }) {
        Some(idx) => {
            let (_, result) = r[idx];
            Some(result)
        }
        None => None
    }
}

/// Compute canonical Unicode decomposition for character
pub fn decompose_canonical(c: char, i: |char|) { d(c, i, false); }

/// Compute canonical or compatible Unicode decomposition for character
pub fn decompose_compatible(c: char, i: |char|) { d(c, i, true); }

fn d(c: char, i: |char|, k: bool) {
    use core::iter::Iterator;

    // 7-bit ASCII never decomposes
    if c <= '\x7f' { i(c); return; }

    // Perform decomposition for Hangul
    if (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) {
        decompose_hangul(c, i);
        return;
    }

    // First check the canonical decompositions
    match bsearch_table(c, canonical_table) {
        Some(canon) => {
            for x in canon.iter() {
                d(*x, |b| i(b), k);
            }
            return;
        }
        None => ()
    }

    // Bottom out if we're not doing compat.
    if !k { i(c); return; }

    // Then check the compatibility decompositions
    match bsearch_table(c, compatibility_table) {
        Some(compat) => {
            for x in compat.iter() {
                d(*x, |b| i(b), k);
            }
            return;
        }
        None => ()
    }

    // Finally bottom out.
    i(c);
}

// Constants from Unicode 6.3.0 Section 3.12 Conjoining Jamo Behavior
static S_BASE: u32 = 0xAC00;
static L_BASE: u32 = 0x1100;
static V_BASE: u32 = 0x1161;
static T_BASE: u32 = 0x11A7;
static L_COUNT: u32 = 19;
static V_COUNT: u32 = 21;
static T_COUNT: u32 = 28;
static N_COUNT: u32 = (V_COUNT * T_COUNT);
static S_COUNT: u32 = (L_COUNT * N_COUNT);

// Decompose a precomposed Hangul syllable
fn decompose_hangul(s: char, f: |char|) {
    use core::mem::transmute;

    let si = s as u32 - S_BASE;

    let li = si / N_COUNT;
    unsafe {
        f(transmute(L_BASE + li));

        let vi = (si % N_COUNT) / T_COUNT;
        f(transmute(V_BASE + vi));

        let ti = si % T_COUNT;
        if ti > 0 {
            f(transmute(T_BASE + ti));
        }
    }
}