about summary refs log tree commit diff
diff options
context:
space:
mode:
authorbors <bors@rust-lang.org>2014-10-04 19:32:07 +0000
committerbors <bors@rust-lang.org>2014-10-04 19:32:07 +0000
commita2e7c4da9b331d337fba0b3911c6d3d7f48e8305 (patch)
tree934452ca7ce6287103d64b086e44feac3a219b71
parente434aa1cf737b050ef79c3d7a0a7eca48c7f8e49 (diff)
parent3aea7f18894bfc35c03044688229f6de84eb42f3 (diff)
downloadrust-a2e7c4da9b331d337fba0b3911c6d3d7f48e8305.tar.gz
rust-a2e7c4da9b331d337fba0b3911c6d3d7f48e8305.zip
auto merge of #17738 : hoeppnertill/rust/master, r=alexcrichton
There is an issue with lev_distance, where
```
fn main() {
    println!("{}", "\x80".lev_distance("\x80"))
}
```
prints `2`.

This is due to using the byte length instead of the char length.
-rw-r--r--src/libcollections/str.rs37
1 files changed, 28 insertions, 9 deletions
diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs
index d198e948ac8..553b34a55c3 100644
--- a/src/libcollections/str.rs
+++ b/src/libcollections/str.rs
@@ -778,13 +778,11 @@ pub trait StrAllocating: Str {
     /// Returns the Levenshtein Distance between two strings.
     fn lev_distance(&self, t: &str) -> uint {
         let me = self.as_slice();
-        let slen = me.len();
-        let tlen = t.len();
+        if me.is_empty() { return t.char_len(); }
+        if t.is_empty() { return me.char_len(); }
 
-        if slen == 0 { return tlen; }
-        if tlen == 0 { return slen; }
-
-        let mut dcol = Vec::from_fn(tlen + 1, |x| x);
+        let mut dcol = Vec::from_fn(t.len() + 1, |x| x);
+        let mut t_last = 0;
 
         for (i, sc) in me.chars().enumerate() {
 
@@ -799,15 +797,15 @@ pub trait StrAllocating: Str {
                     *dcol.get_mut(j + 1) = current;
                 } else {
                     *dcol.get_mut(j + 1) = cmp::min(current, next);
-                    *dcol.get_mut(j + 1) = cmp::min(dcol[j + 1],
-                                                    dcol[j]) + 1;
+                    *dcol.get_mut(j + 1) = cmp::min(dcol[j + 1], dcol[j]) + 1;
                 }
 
                 current = next;
+                t_last = j;
             }
         }
 
-        return dcol[tlen];
+        dcol[t_last + 1]
     }
 
     /// Returns an iterator over the string in Unicode Normalization Form D
@@ -1879,6 +1877,27 @@ mod tests {
     }
 
     #[test]
+    fn test_lev_distance() {
+        use std::char::{ from_u32, MAX };
+        // Test bytelength agnosticity
+        for c in range(0u32, MAX as u32)
+                 .filter_map(|i| from_u32(i))
+                 .map(|i| String::from_char(1, i)) {
+            assert_eq!(c[].lev_distance(c[]), 0);
+        }
+
+        let a = "\nMäry häd ä little lämb\n\nLittle lämb\n";
+        let b = "\nMary häd ä little lämb\n\nLittle lämb\n";
+        let c = "Mary häd ä little lämb\n\nLittle lämb\n";
+        assert_eq!(a.lev_distance(b), 1);
+        assert_eq!(b.lev_distance(a), 1);
+        assert_eq!(a.lev_distance(c), 2);
+        assert_eq!(c.lev_distance(a), 2);
+        assert_eq!(b.lev_distance(c), 1);
+        assert_eq!(c.lev_distance(b), 1);
+    }
+
+    #[test]
     fn test_nfd_chars() {
         macro_rules! t {
             ($input: expr, $expected: expr) => {