From c71cec8834bf30032a8e49d2949f6d8d4080b639 Mon Sep 17 00:00:00 2001 From: David Wood Date: Sun, 14 Jan 2018 17:29:07 +0000 Subject: end_point handling multibyte characters correctly. --- src/libsyntax/codemap.rs | 37 +++++++++++++++++++++++++++++++++++++ src/libsyntax/parse/parser.rs | 8 +++++--- 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'src/libsyntax') diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index a58a61c3636..e74066da0ac 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -25,6 +25,7 @@ pub use self::ExpnFormat::*; use rustc_data_structures::fx::FxHashMap; use rustc_data_structures::stable_hasher::StableHasher; use std::cell::{RefCell, Ref}; +use std::cmp; use std::hash::Hash; use std::path::{Path, PathBuf}; use std::rc::Rc; @@ -607,6 +608,42 @@ impl CodeMap { self.span_until_char(sp, '{') } + /// Returns a new span representing just the end-point of this span + pub fn end_point(&self, sp: Span) -> Span { + let hi = sp.hi().0.checked_sub(1).unwrap_or(sp.hi().0); + let hi = self.get_start_of_char_bytepos(BytePos(hi)); + let lo = cmp::max(hi.0, sp.lo().0); + sp.with_lo(BytePos(lo)) + } + + /// Returns a new span representing the next character after the end-point of this span + pub fn next_point(&self, sp: Span) -> Span { + let hi = sp.lo().0.checked_add(1).unwrap_or(sp.lo().0); + let hi = self.get_start_of_char_bytepos(BytePos(hi)); + let lo = cmp::max(sp.hi().0, hi.0); + Span::new(BytePos(lo), BytePos(lo), sp.ctxt()) + } + + fn get_start_of_char_bytepos(&self, bpos: BytePos) -> BytePos { + let idx = self.lookup_filemap_idx(bpos); + let files = self.files.borrow(); + let map = &(*files)[idx]; + + for mbc in map.multibyte_chars.borrow().iter() { + if mbc.pos < bpos { + if bpos.to_usize() >= mbc.pos.to_usize() + mbc.bytes { + // If we do, then return the start of the character. + return mbc.pos; + } + } else { + break; + } + } + + // If this isn't a multibyte character, return the original position. + return bpos; + } + pub fn get_filemap(&self, filename: &FileName) -> Option> { for fm in self.files.borrow().iter() { if *filename == fm.name { diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index d393cab4718..e8e87e2854b 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -704,13 +704,15 @@ impl<'a> Parser<'a> { expect.clone() }; (format!("expected one of {}, found `{}`", expect, actual), - (self.prev_span.next_point(), format!("expected one of {} here", short_expect))) + (self.sess.codemap().next_point(self.prev_span), + format!("expected one of {} here", short_expect))) } else if expected.is_empty() { (format!("unexpected token: `{}`", actual), (self.prev_span, "unexpected token after this".to_string())) } else { (format!("expected {}, found `{}`", expect, actual), - (self.prev_span.next_point(), format!("expected {} here", expect))) + (self.sess.codemap().next_point(self.prev_span), + format!("expected {} here", expect))) }; let mut err = self.fatal(&msg_exp); let sp = if self.token == token::Token::Eof { @@ -3190,7 +3192,7 @@ impl<'a> Parser<'a> { // return. This won't catch blocks with an explicit `return`, but that would be caught by // the dead code lint. if self.eat_keyword(keywords::Else) || !cond.returns() { - let sp = lo.next_point(); + let sp = self.sess.codemap().next_point(lo); let mut err = self.diagnostic() .struct_span_err(sp, "missing condition for `if` statemement"); err.span_label(sp, "expected if condition here"); -- cgit 1.4.1-3-g733a5 From 62356471b3746012df74db22479b03ad1f6ab8ab Mon Sep 17 00:00:00 2001 From: David Wood Date: Tue, 16 Jan 2018 20:41:00 +0000 Subject: Replaced multi-byte character handling in end_point with potentially more performant variant. --- src/libsyntax/codemap.rs | 82 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 23 deletions(-) (limited to 'src/libsyntax') diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index e74066da0ac..76050f8dc09 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -610,38 +610,74 @@ impl CodeMap { /// Returns a new span representing just the end-point of this span pub fn end_point(&self, sp: Span) -> Span { - let hi = sp.hi().0.checked_sub(1).unwrap_or(sp.hi().0); - let hi = self.get_start_of_char_bytepos(BytePos(hi)); - let lo = cmp::max(hi.0, sp.lo().0); - sp.with_lo(BytePos(lo)) + let pos = sp.hi().0; + + let width = self.find_width_of_character_at_span(sp, false); + let corrected_end_position = pos.checked_sub(width).unwrap_or(pos); + + let end_point = BytePos(cmp::max(corrected_end_position, sp.lo().0)); + sp.with_lo(end_point) } /// Returns a new span representing the next character after the end-point of this span pub fn next_point(&self, sp: Span) -> Span { - let hi = sp.lo().0.checked_add(1).unwrap_or(sp.lo().0); - let hi = self.get_start_of_char_bytepos(BytePos(hi)); - let lo = cmp::max(sp.hi().0, hi.0); - Span::new(BytePos(lo), BytePos(lo), sp.ctxt()) + let pos = sp.lo().0; + + let width = self.find_width_of_character_at_span(sp, true); + let corrected_next_position = pos.checked_add(width).unwrap_or(pos); + + let next_point = BytePos(cmp::max(sp.hi().0, corrected_next_position)); + Span::new(next_point, next_point, sp.ctxt()) } - fn get_start_of_char_bytepos(&self, bpos: BytePos) -> BytePos { - let idx = self.lookup_filemap_idx(bpos); - let files = self.files.borrow(); - let map = &(*files)[idx]; + /// Finds the width of a character, either before or after the provided span. + fn find_width_of_character_at_span(&self, sp: Span, forwards: bool) -> u32 { + // Disregard malformed spans and assume a one-byte wide character. + if sp.lo() > sp.hi() { + return 1; + } - for mbc in map.multibyte_chars.borrow().iter() { - if mbc.pos < bpos { - if bpos.to_usize() >= mbc.pos.to_usize() + mbc.bytes { - // If we do, then return the start of the character. - return mbc.pos; - } - } else { - break; - } + let local_begin = self.lookup_byte_offset(sp.lo()); + let local_end = self.lookup_byte_offset(sp.hi()); + + let start_index = local_begin.pos.to_usize(); + let end_index = local_end.pos.to_usize(); + + // Disregard indexes that are at the start or end of their spans, they can't fit bigger + // characters. + if (!forwards && end_index == usize::min_value()) || + (forwards && start_index == usize::max_value()) { + return 1; + } + + let source_len = (local_begin.fm.end_pos - local_begin.fm.start_pos).to_usize(); + // Ensure indexes are also not malformed. + if start_index > end_index || end_index > source_len { + return 1; } - // If this isn't a multibyte character, return the original position. - return bpos; + // We need to extend the snippet to the end of the src rather than to end_index so when + // searching forwards for boundaries we've got somewhere to search. + let snippet = if let Some(ref src) = local_begin.fm.src { + let len = src.len(); + (&src[start_index..len]).to_string() + } else if let Some(src) = local_begin.fm.external_src.borrow().get_source() { + let len = src.len(); + (&src[start_index..len]).to_string() + } else { + return 1; + }; + + let mut target = if forwards { end_index + 1 } else { end_index - 1 }; + while !snippet.is_char_boundary(target - start_index) { + target = if forwards { target + 1 } else { target - 1 }; + } + + if forwards { + (target - end_index) as u32 + } else { + (end_index - target) as u32 + } } pub fn get_filemap(&self, filename: &FileName) -> Option> { -- cgit 1.4.1-3-g733a5 From be465b0b85746b2f56dc4bb1842e603e8489a0f3 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 17 Jan 2018 10:01:57 +0000 Subject: next_point now handles creating spans over multibyte characters. --- src/libsyntax/codemap.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'src/libsyntax') diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 76050f8dc09..cfb891f0faa 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -621,13 +621,17 @@ impl CodeMap { /// Returns a new span representing the next character after the end-point of this span pub fn next_point(&self, sp: Span) -> Span { - let pos = sp.lo().0; + let start_of_next_point = sp.hi().0; let width = self.find_width_of_character_at_span(sp, true); - let corrected_next_position = pos.checked_add(width).unwrap_or(pos); - - let next_point = BytePos(cmp::max(sp.hi().0, corrected_next_position)); - Span::new(next_point, next_point, sp.ctxt()) + // If the width is 1, then the next span should point to the same `lo` and `hi`. However, + // in the case of a multibyte character, where the width != 1, the next span should + // span multiple bytes to include the whole character. + let end_of_next_point = start_of_next_point.checked_add( + width - 1).unwrap_or(start_of_next_point); + + let end_of_next_point = BytePos(cmp::max(sp.lo().0 + 1, end_of_next_point)); + Span::new(BytePos(start_of_next_point), end_of_next_point, sp.ctxt()) } /// Finds the width of a character, either before or after the provided span. -- cgit 1.4.1-3-g733a5 From 0c467d5d0924e705c8a4b84b250e127c62222239 Mon Sep 17 00:00:00 2001 From: David Wood Date: Wed, 17 Jan 2018 16:41:58 +0000 Subject: Now handling case where span has same lo and hi. --- src/libsyntax/codemap.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src/libsyntax') diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index cfb891f0faa..8c1bdab28a9 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -637,7 +637,7 @@ impl CodeMap { /// Finds the width of a character, either before or after the provided span. fn find_width_of_character_at_span(&self, sp: Span, forwards: bool) -> u32 { // Disregard malformed spans and assume a one-byte wide character. - if sp.lo() > sp.hi() { + if sp.lo() >= sp.hi() { return 1; } @@ -671,11 +671,16 @@ impl CodeMap { } else { return 1; }; + debug!("DTW start {:?} end {:?}", start_index, end_index); + debug!("DTW snippet {:?}", snippet); let mut target = if forwards { end_index + 1 } else { end_index - 1 }; + debug!("DTW initial target {:?}", target); while !snippet.is_char_boundary(target - start_index) { target = if forwards { target + 1 } else { target - 1 }; + debug!("DTW update target {:?}", target); } + debug!("DTW final target {:?}", target); if forwards { (target - end_index) as u32 -- cgit 1.4.1-3-g733a5 From 0bd96671f0312fdc1eb07885835e58d258f1f927 Mon Sep 17 00:00:00 2001 From: David Wood Date: Sat, 27 Jan 2018 13:30:34 +0000 Subject: Fixed infinite loop issues and added some improved logging. --- src/libsyntax/codemap.rs | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'src/libsyntax') diff --git a/src/libsyntax/codemap.rs b/src/libsyntax/codemap.rs index 8c1bdab28a9..a6a7f9e20b3 100644 --- a/src/libsyntax/codemap.rs +++ b/src/libsyntax/codemap.rs @@ -638,25 +638,33 @@ impl CodeMap { fn find_width_of_character_at_span(&self, sp: Span, forwards: bool) -> u32 { // Disregard malformed spans and assume a one-byte wide character. if sp.lo() >= sp.hi() { + debug!("find_width_of_character_at_span: early return malformed span"); return 1; } let local_begin = self.lookup_byte_offset(sp.lo()); let local_end = self.lookup_byte_offset(sp.hi()); + debug!("find_width_of_character_at_span: local_begin=`{:?}`, local_end=`{:?}`", + local_begin, local_end); let start_index = local_begin.pos.to_usize(); let end_index = local_end.pos.to_usize(); + debug!("find_width_of_character_at_span: start_index=`{:?}`, end_index=`{:?}`", + start_index, end_index); // Disregard indexes that are at the start or end of their spans, they can't fit bigger // characters. if (!forwards && end_index == usize::min_value()) || (forwards && start_index == usize::max_value()) { + debug!("find_width_of_character_at_span: start or end of span, cannot be multibyte"); return 1; } let source_len = (local_begin.fm.end_pos - local_begin.fm.start_pos).to_usize(); + debug!("find_width_of_character_at_span: source_len=`{:?}`", source_len); // Ensure indexes are also not malformed. if start_index > end_index || end_index > source_len { + debug!("find_width_of_character_at_span: source indexes are malformed"); return 1; } @@ -671,16 +679,22 @@ impl CodeMap { } else { return 1; }; - debug!("DTW start {:?} end {:?}", start_index, end_index); - debug!("DTW snippet {:?}", snippet); + debug!("find_width_of_character_at_span: snippet=`{:?}`", snippet); + + let file_start_pos = local_begin.fm.start_pos.to_usize(); + let file_end_pos = local_begin.fm.end_pos.to_usize(); + debug!("find_width_of_character_at_span: file_start_pos=`{:?}` file_end_pos=`{:?}`", + file_start_pos, file_end_pos); let mut target = if forwards { end_index + 1 } else { end_index - 1 }; - debug!("DTW initial target {:?}", target); - while !snippet.is_char_boundary(target - start_index) { + debug!("find_width_of_character_at_span: initial target=`{:?}`", target); + + while !snippet.is_char_boundary(target - start_index) + && target >= file_start_pos && target <= file_end_pos { target = if forwards { target + 1 } else { target - 1 }; - debug!("DTW update target {:?}", target); + debug!("find_width_of_character_at_span: target=`{:?}`", target); } - debug!("DTW final target {:?}", target); + debug!("find_width_of_character_at_span: final target=`{:?}`", target); if forwards { (target - end_index) as u32 -- cgit 1.4.1-3-g733a5