diff options
| author | bors <bors@rust-lang.org> | 2014-10-04 19:32:07 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2014-10-04 19:32:07 +0000 |
| commit | a2e7c4da9b331d337fba0b3911c6d3d7f48e8305 (patch) | |
| tree | 934452ca7ce6287103d64b086e44feac3a219b71 | |
| parent | e434aa1cf737b050ef79c3d7a0a7eca48c7f8e49 (diff) | |
| parent | 3aea7f18894bfc35c03044688229f6de84eb42f3 (diff) | |
| download | rust-a2e7c4da9b331d337fba0b3911c6d3d7f48e8305.tar.gz rust-a2e7c4da9b331d337fba0b3911c6d3d7f48e8305.zip | |
auto merge of #17738 : hoeppnertill/rust/master, r=alexcrichton
There is an issue with lev_distance, where
```
fn main() {
println!("{}", "\x80".lev_distance("\x80"))
}
```
prints `2`.
This is due to using the byte length instead of the char length.
| -rw-r--r-- | src/libcollections/str.rs | 37 |
1 files changed, 28 insertions, 9 deletions
diff --git a/src/libcollections/str.rs b/src/libcollections/str.rs index d198e948ac8..553b34a55c3 100644 --- a/src/libcollections/str.rs +++ b/src/libcollections/str.rs @@ -778,13 +778,11 @@ pub trait StrAllocating: Str { /// Returns the Levenshtein Distance between two strings. fn lev_distance(&self, t: &str) -> uint { let me = self.as_slice(); - let slen = me.len(); - let tlen = t.len(); + if me.is_empty() { return t.char_len(); } + if t.is_empty() { return me.char_len(); } - if slen == 0 { return tlen; } - if tlen == 0 { return slen; } - - let mut dcol = Vec::from_fn(tlen + 1, |x| x); + let mut dcol = Vec::from_fn(t.len() + 1, |x| x); + let mut t_last = 0; for (i, sc) in me.chars().enumerate() { @@ -799,15 +797,15 @@ pub trait StrAllocating: Str { *dcol.get_mut(j + 1) = current; } else { *dcol.get_mut(j + 1) = cmp::min(current, next); - *dcol.get_mut(j + 1) = cmp::min(dcol[j + 1], - dcol[j]) + 1; + *dcol.get_mut(j + 1) = cmp::min(dcol[j + 1], dcol[j]) + 1; } current = next; + t_last = j; } } - return dcol[tlen]; + dcol[t_last + 1] } /// Returns an iterator over the string in Unicode Normalization Form D @@ -1879,6 +1877,27 @@ mod tests { } #[test] + fn test_lev_distance() { + use std::char::{ from_u32, MAX }; + // Test bytelength agnosticity + for c in range(0u32, MAX as u32) + .filter_map(|i| from_u32(i)) + .map(|i| String::from_char(1, i)) { + assert_eq!(c[].lev_distance(c[]), 0); + } + + let a = "\nMäry häd ä little lämb\n\nLittle lämb\n"; + let b = "\nMary häd ä little lämb\n\nLittle lämb\n"; + let c = "Mary häd ä little lämb\n\nLittle lämb\n"; + assert_eq!(a.lev_distance(b), 1); + assert_eq!(b.lev_distance(a), 1); + assert_eq!(a.lev_distance(c), 2); + assert_eq!(c.lev_distance(a), 2); + assert_eq!(b.lev_distance(c), 1); + assert_eq!(c.lev_distance(b), 1); + } + + #[test] fn test_nfd_chars() { macro_rules! t { ($input: expr, $expected: expr) => { |
