diff options
Diffstat (limited to 'src/librbml/lib.rs')
| -rw-r--r-- | src/librbml/lib.rs | 636 |
1 files changed, 438 insertions, 198 deletions
diff --git a/src/librbml/lib.rs b/src/librbml/lib.rs index 05cd24de736..844d097bdaf 100644 --- a/src/librbml/lib.rs +++ b/src/librbml/lib.rs @@ -1,4 +1,4 @@ -// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT +// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT // file at the top-level directory of this distribution and at // http://rust-lang.org/COPYRIGHT. // @@ -8,12 +8,108 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! Really Bad Markup Language (rbml) is a temporary measure until we migrate -//! the rust object metadata to a better serialization format. It is not -//! intended to be used by users. +//! Really Bad Markup Language (rbml) is an internal serialization format of rustc. +//! This is not intended to be used by users. //! -//! It is loosely based on the Extensible Binary Markup Language (ebml): -//! http://www.matroska.org/technical/specs/rfc/index.html +//! Originally based on the Extensible Binary Markup Language +//! (ebml; http://www.matroska.org/technical/specs/rfc/index.html), +//! it is now a separate format tuned for the rust object metadata. +//! +//! # Encoding +//! +//! RBML document consists of the tag, length and data. +//! The encoded data can contain multiple RBML documents concatenated. +//! +//! **Tags** are a hint for the following data. +//! Tags are a number from 0x000 to 0xfff, where 0xf0 through 0xff is reserved. +//! Tags less than 0xf0 are encoded in one literal byte. +//! Tags greater than 0xff are encoded in two big-endian bytes, +//! where the tag number is ORed with 0xf000. (E.g. tag 0x123 = `f1 23`) +//! +//! **Lengths** encode the length of the following data. +//! It is a variable-length unsigned int, and one of the following forms: +//! +//! - `80` through `fe` for lengths up to 0x7e; +//! - `40 ff` through `7f ff` for lengths up to 0x3fff; +//! - `20 40 00` through `3f ff ff` for lengths up to 0x1fffff; +//! 
- `10 20 00 00` through `1f ff ff ff` for lengths up to 0xfffffff. +//! +//! The "overlong" form is allowed so that the length can be encoded +//! without the prior knowledge of the encoded data. +//! For example, the length 0 can be represented either by `80`, `40 00`, +//! `20 00 00` or `10 00 00 00`. +//! The encoder tries to minimize the length if possible. +//! Also, some predefined tags listed below are so commonly used that +//! their lengths are omitted ("implicit length"). +//! +//! **Data** can be either binary bytes or zero or more nested RBML documents. +//! Nested documents cannot overflow, and should be entirely contained +//! within a parent document. +//! +//! # Predefined Tags +//! +//! Most RBML tags are defined by the application. +//! (For the rust object metadata, see also `rustc::metadata::common`.) +//! RBML itself does define a set of predefined tags however, +//! intended for the auto-serialization implementation. +//! +//! Predefined tags with an implicit length: +//! +//! - `U8` (`00`): 1-byte unsigned integer. +//! - `U16` (`01`): 2-byte big endian unsigned integer. +//! - `U32` (`02`): 4-byte big endian unsigned integer. +//! - `U64` (`03`): 8-byte big endian unsigned integer. +//! Any of `U*` tags can be used to encode primitive unsigned integer types, +//! as long as it is no greater than the actual size. +//! For example, `u8` can only be represented via the `U8` tag. +//! +//! - `I8` (`04`): 1-byte signed integer. +//! - `I16` (`05`): 2-byte big endian signed integer. +//! - `I32` (`06`): 4-byte big endian signed integer. +//! - `I64` (`07`): 8-byte big endian signed integer. +//! Similar to `U*` tags. Always uses two's complement encoding. +//! +//! - `Bool` (`08`): 1-byte boolean value, `00` for false and `01` for true. +//! +//! - `Char` (`09`): 4-byte big endian Unicode scalar value. +//! Surrogate pairs or out-of-bound values are invalid. +//! +//! - `F32` (`0a`): 4-byte big endian unsigned integer representing +//! 
IEEE 754 binary32 floating-point format. +//! - `F64` (`0b`): 8-byte big endian unsigned integer representing +//! IEEE 754 binary64 floating-point format. +//! +//! - `Sub8` (`0c`): 1-byte unsigned integer for supplementary information. +//! - `Sub32` (`0d`): 4-byte unsigned integer for supplementary information. +//! Those two tags normally occur as the first subdocument of certain tags, +//! namely `Enum`, `Vec` and `Map`, to provide a variant or size information. +//! They can be used interchangeably. +//! +//! Predefined tags with an explicit length: +//! +//! - `Str` (`10`): A UTF-8-encoded string. +//! +//! - `Enum` (`11`): An enum. +//! The first subdocument should be `Sub*` tags with a variant ID. +//! Subsequent subdocuments, if any, encode variant arguments. +//! +//! - `Vec` (`12`): A vector (sequence). +//! - `VecElt` (`13`): A vector element. +//! The first subdocument should be `Sub*` tags with the number of elements. +//! Subsequent subdocuments should be `VecElt` tag per each element. +//! +//! - `Map` (`14`): A map (associative array). +//! - `MapKey` (`15`): A key part of the map entry. +//! - `MapVal` (`16`): A value part of the map entry. +//! The first subdocument should be `Sub*` tags with the number of entries. +//! Subsequent subdocuments should be an alternating sequence of +//! `MapKey` and `MapVal` tags per each entry. +//! +//! - `Opaque` (`17`): An opaque, custom-format tag. +//! Used to wrap ordinary custom tags or data in the auto-serialized context. +//! Rustc typically uses this to encode type information. +//! +//! First 0x20 tags are reserved by RBML; custom tags start at 0x20. 
#![crate_name = "rbml"] #![unstable(feature = "rustc_private")] @@ -64,6 +160,10 @@ impl<'doc> Doc<'doc> { reader::get_doc(*self, tag) } + pub fn is_empty(&self) -> bool { + self.start == self.end + } + pub fn as_str_slice<'a>(&'a self) -> &'a str { str::from_utf8(&self.data[self.start..self.end]).unwrap() } @@ -80,41 +180,51 @@ pub struct TaggedDoc<'a> { #[derive(Copy, Debug)] pub enum EbmlEncoderTag { - EsUint, // 0 - EsU64, // 1 - EsU32, // 2 - EsU16, // 3 - EsU8, // 4 - EsInt, // 5 - EsI64, // 6 - EsI32, // 7 - EsI16, // 8 - EsI8, // 9 - EsBool, // 10 - EsChar, // 11 - EsStr, // 12 - EsF64, // 13 - EsF32, // 14 - EsFloat, // 15 - EsEnum, // 16 - EsEnumVid, // 17 - EsEnumBody, // 18 - EsVec, // 19 - EsVecLen, // 20 - EsVecElt, // 21 - EsMap, // 22 - EsMapLen, // 23 - EsMapKey, // 24 - EsMapVal, // 25 - - EsOpaque, - - EsLabel, // Used only when debugging + // tags 00..1f are reserved for auto-serialization. + // first NUM_IMPLICIT_TAGS tags are implicitly sized and lengths are not encoded. 
+ + EsU8 = 0x00, // + 1 byte + EsU16 = 0x01, // + 2 bytes + EsU32 = 0x02, // + 4 bytes + EsU64 = 0x03, // + 8 bytes + EsI8 = 0x04, // + 1 byte + EsI16 = 0x05, // + 2 bytes + EsI32 = 0x06, // + 4 bytes + EsI64 = 0x07, // + 8 bytes + EsBool = 0x08, // + 1 byte + EsChar = 0x09, // + 4 bytes + EsF32 = 0x0a, // + 4 bytes + EsF64 = 0x0b, // + 8 bytes + EsSub8 = 0x0c, // + 1 byte + EsSub32 = 0x0d, // + 4 bytes + // 0x0e and 0x0f are reserved + + EsStr = 0x10, + EsEnum = 0x11, // encodes the variant id as the first EsSub* + EsVec = 0x12, // encodes the # of elements as the first EsSub* + EsVecElt = 0x13, + EsMap = 0x14, // encodes the # of pairs as the first EsSub* + EsMapKey = 0x15, + EsMapVal = 0x16, + EsOpaque = 0x17, } +const NUM_TAGS: uint = 0x1000; +const NUM_IMPLICIT_TAGS: uint = 0x0e; + +static TAG_IMPLICIT_LEN: [i8; NUM_IMPLICIT_TAGS] = [ + 1, 2, 4, 8, // EsU* + 1, 2, 4, 8, // ESI* + 1, // EsBool + 4, // EsChar + 4, 8, // EsF* + 1, 4, // EsSub* +]; + #[derive(Debug)] pub enum Error { IntTooBig(uint), + InvalidTag(uint), Expected(String), IoError(std::old_io::IoError), ApplicationError(String) @@ -138,11 +248,11 @@ pub mod reader { use serialize; - use super::{ ApplicationError, EsVec, EsMap, EsEnum, EsVecLen, EsVecElt, - EsMapLen, EsMapKey, EsEnumVid, EsU64, EsU32, EsU16, EsU8, EsInt, EsI64, + use super::{ ApplicationError, EsVec, EsMap, EsEnum, EsSub8, EsSub32, + EsVecElt, EsMapKey, EsU64, EsU32, EsU16, EsU8, EsI64, EsI32, EsI16, EsI8, EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal, - EsEnumBody, EsUint, EsOpaque, EsLabel, EbmlEncoderTag, Doc, TaggedDoc, - Error, IntTooBig, Expected }; + EsOpaque, EbmlEncoderTag, Doc, TaggedDoc, + Error, IntTooBig, InvalidTag, Expected, NUM_IMPLICIT_TAGS, TAG_IMPLICIT_LEN }; pub type DecodeResult<T> = Result<T, Error>; // rbml reading @@ -165,6 +275,18 @@ pub mod reader { pub next: uint } + pub fn tag_at(data: &[u8], start: uint) -> DecodeResult<Res> { + let v = data[start] as uint; + if v < 0xf0 { + Ok(Res { val: v, next: start + 
1 }) + } else if v > 0xf0 { + Ok(Res { val: ((v & 0xf) << 8) | data[start + 1] as uint, next: start + 2 }) + } else { + // every tag starting with byte 0xf0 is an overlong form, which is prohibited. + Err(InvalidTag(v)) + } + } + #[inline(never)] fn vuint_at_slow(data: &[u8], start: uint) -> DecodeResult<Res> { let a = data[start]; @@ -237,9 +359,17 @@ pub mod reader { } } + pub fn tag_len_at(data: &[u8], tag: Res) -> DecodeResult<Res> { + if tag.val < NUM_IMPLICIT_TAGS && TAG_IMPLICIT_LEN[tag.val] >= 0 { + Ok(Res { val: TAG_IMPLICIT_LEN[tag.val] as uint, next: tag.next }) + } else { + vuint_at(data, tag.next) + } + } + pub fn doc_at<'a>(data: &'a [u8], start: uint) -> DecodeResult<TaggedDoc<'a>> { - let elt_tag = try!(vuint_at(data, start)); - let elt_size = try!(vuint_at(data, elt_tag.next)); + let elt_tag = try!(tag_at(data, start)); + let elt_size = try!(tag_len_at(data, elt_tag)); let end = elt_size.next + elt_size.val; Ok(TaggedDoc { tag: elt_tag.val, @@ -250,8 +380,8 @@ pub mod reader { pub fn maybe_get_doc<'a>(d: Doc<'a>, tg: uint) -> Option<Doc<'a>> { let mut pos = d.start; while pos < d.end { - let elt_tag = try_or!(vuint_at(d.data, pos), None); - let elt_size = try_or!(vuint_at(d.data, elt_tag.next), None); + let elt_tag = try_or!(tag_at(d.data, pos), None); + let elt_size = try_or!(tag_len_at(d.data, elt_tag), None); pos = elt_size.next + elt_size.val; if elt_tag.val == tg { return Some(Doc { data: d.data, start: elt_size.next, @@ -276,8 +406,8 @@ pub mod reader { { let mut pos = d.start; while pos < d.end { - let elt_tag = try_or!(vuint_at(d.data, pos), false); - let elt_size = try_or!(vuint_at(d.data, elt_tag.next), false); + let elt_tag = try_or!(tag_at(d.data, pos), false); + let elt_size = try_or!(tag_len_at(d.data, elt_tag), false); pos = elt_size.next + elt_size.val; let doc = Doc { data: d.data, start: elt_size.next, end: pos }; if !it(elt_tag.val, doc) { @@ -292,8 +422,8 @@ pub mod reader { { let mut pos = d.start; while pos < d.end { - let 
elt_tag = try_or!(vuint_at(d.data, pos), false); - let elt_size = try_or!(vuint_at(d.data, elt_tag.next), false); + let elt_tag = try_or!(tag_at(d.data, pos), false); + let elt_size = try_or!(tag_len_at(d.data, elt_tag), false); pos = elt_size.next + elt_size.val; if elt_tag.val == tg { let doc = Doc { data: d.data, start: elt_size.next, @@ -357,23 +487,6 @@ pub mod reader { } } - fn _check_label(&mut self, lbl: &str) -> DecodeResult<()> { - if self.pos < self.parent.end { - let TaggedDoc { tag: r_tag, doc: r_doc } = - try!(doc_at(self.parent.data, self.pos)); - - if r_tag == (EsLabel as uint) { - self.pos = r_doc.end; - let str = r_doc.as_str_slice(); - if lbl != str { - return Err(Expected(format!("Expected label {:?} but \ - found {:?}", lbl, str))); - } - } - } - Ok(()) - } - fn next_doc(&mut self, exp_tag: EbmlEncoderTag) -> DecodeResult<Doc<'doc>> { debug!(". next_doc(exp_tag={:?})", exp_tag); if self.pos >= self.parent.end { @@ -416,10 +529,66 @@ pub mod reader { Ok(r) } - fn _next_uint(&mut self, exp_tag: EbmlEncoderTag) -> DecodeResult<uint> { - let r = doc_as_u32(try!(self.next_doc(exp_tag))); - debug!("_next_uint exp_tag={:?} result={:?}", exp_tag, r); - Ok(r as uint) + fn _next_sub(&mut self) -> DecodeResult<uint> { + // empty vector/map optimization + if self.parent.is_empty() { + return Ok(0); + } + + let TaggedDoc { tag: r_tag, doc: r_doc } = + try!(doc_at(self.parent.data, self.pos)); + let r = if r_tag == (EsSub8 as uint) { + doc_as_u8(r_doc) as uint + } else if r_tag == (EsSub32 as uint) { + doc_as_u32(r_doc) as uint + } else { + return Err(Expected(format!("expected EBML doc with tag {:?} or {:?} but \ + found tag {:?}", EsSub8, EsSub32, r_tag))); + }; + if r_doc.end > self.parent.end { + return Err(Expected(format!("invalid EBML, child extends to \ + {:#x}, parent to {:#x}", + r_doc.end, self.parent.end))); + } + self.pos = r_doc.end; + debug!("_next_sub result={:?}", r); + Ok(r) + } + + // variable-length unsigned integer with different tags. 
+ // `first_tag` should be a tag for u8 or i8. + // `last_tag` should be the largest allowed integer tag with the matching signedness. + // all tags between them should be valid, in the order of u8, u16, u32 and u64. + fn _next_int(&mut self, + first_tag: EbmlEncoderTag, + last_tag: EbmlEncoderTag) -> DecodeResult<u64> { + if self.pos >= self.parent.end { + return Err(Expected(format!("no more documents in \ + current node!"))); + } + + let TaggedDoc { tag: r_tag, doc: r_doc } = + try!(doc_at(self.parent.data, self.pos)); + let r = if first_tag as uint <= r_tag && r_tag <= last_tag as uint { + match r_tag - first_tag as uint { + 0 => doc_as_u8(r_doc) as u64, + 1 => doc_as_u16(r_doc) as u64, + 2 => doc_as_u32(r_doc) as u64, + 3 => doc_as_u64(r_doc) as u64, + _ => unreachable!(), + } + } else { + return Err(Expected(format!("expected EBML doc with tag {:?} through {:?} but \ + found tag {:?}", first_tag, last_tag, r_tag))); + }; + if r_doc.end > self.parent.end { + return Err(Expected(format!("invalid EBML, child extends to \ + {:#x}, parent to {:#x}", + r_doc.end, self.parent.end))); + } + self.pos = r_doc.end; + debug!("_next_int({:?}, {:?}) result={:?}", first_tag, last_tag, r); + Ok(r) } pub fn read_opaque<R, F>(&mut self, op: F) -> DecodeResult<R> where @@ -443,12 +612,12 @@ pub mod reader { type Error = Error; fn read_nil(&mut self) -> DecodeResult<()> { Ok(()) } - fn read_u64(&mut self) -> DecodeResult<u64> { Ok(doc_as_u64(try!(self.next_doc(EsU64)))) } - fn read_u32(&mut self) -> DecodeResult<u32> { Ok(doc_as_u32(try!(self.next_doc(EsU32)))) } - fn read_u16(&mut self) -> DecodeResult<u16> { Ok(doc_as_u16(try!(self.next_doc(EsU16)))) } - fn read_u8 (&mut self) -> DecodeResult<u8 > { Ok(doc_as_u8 (try!(self.next_doc(EsU8 )))) } + fn read_u64(&mut self) -> DecodeResult<u64> { self._next_int(EsU8, EsU64) } + fn read_u32(&mut self) -> DecodeResult<u32> { Ok(try!(self._next_int(EsU8, EsU32)) as u32) } + fn read_u16(&mut self) -> DecodeResult<u16> { 
Ok(try!(self._next_int(EsU8, EsU16)) as u16) } + fn read_u8(&mut self) -> DecodeResult<u8> { Ok(doc_as_u8(try!(self.next_doc(EsU8)))) } fn read_uint(&mut self) -> DecodeResult<uint> { - let v = doc_as_u64(try!(self.next_doc(EsUint))); + let v = try!(self._next_int(EsU8, EsU64)); if v > (::std::usize::MAX as u64) { Err(IntTooBig(v as uint)) } else { @@ -456,20 +625,12 @@ pub mod reader { } } - fn read_i64(&mut self) -> DecodeResult<i64> { - Ok(doc_as_u64(try!(self.next_doc(EsI64))) as i64) - } - fn read_i32(&mut self) -> DecodeResult<i32> { - Ok(doc_as_u32(try!(self.next_doc(EsI32))) as i32) - } - fn read_i16(&mut self) -> DecodeResult<i16> { - Ok(doc_as_u16(try!(self.next_doc(EsI16))) as i16) - } - fn read_i8 (&mut self) -> DecodeResult<i8> { - Ok(doc_as_u8(try!(self.next_doc(EsI8 ))) as i8) - } + fn read_i64(&mut self) -> DecodeResult<i64> { Ok(try!(self._next_int(EsI8, EsI64)) as i64) } + fn read_i32(&mut self) -> DecodeResult<i32> { Ok(try!(self._next_int(EsI8, EsI32)) as i32) } + fn read_i16(&mut self) -> DecodeResult<i16> { Ok(try!(self._next_int(EsI8, EsI16)) as i16) } + fn read_i8(&mut self) -> DecodeResult<i8> { Ok(doc_as_u8(try!(self.next_doc(EsI8))) as i8) } fn read_int(&mut self) -> DecodeResult<int> { - let v = doc_as_u64(try!(self.next_doc(EsInt))) as i64; + let v = try!(self._next_int(EsI8, EsI64)) as i64; if v > (isize::MAX as i64) || v < (isize::MIN as i64) { debug!("FIXME \\#6122: Removing this makes this function miscompile"); Err(IntTooBig(v as uint)) @@ -502,7 +663,6 @@ pub mod reader { F: FnOnce(&mut Decoder<'doc>) -> DecodeResult<T>, { debug!("read_enum({})", name); - try!(self._check_label(name)); let doc = try!(self.next_doc(EsEnum)); @@ -522,20 +682,10 @@ pub mod reader { where F: FnMut(&mut Decoder<'doc>, uint) -> DecodeResult<T>, { debug!("read_enum_variant()"); - let idx = try!(self._next_uint(EsEnumVid)); + let idx = try!(self._next_sub()); debug!(" idx={}", idx); - let doc = try!(self.next_doc(EsEnumBody)); - - let (old_parent, 
old_pos) = (self.parent, self.pos); - self.parent = doc; - self.pos = self.parent.start; - - let result = try!(f(self, idx)); - - self.parent = old_parent; - self.pos = old_pos; - Ok(result) + f(self, idx) } fn read_enum_variant_arg<T, F>(&mut self, idx: uint, f: F) -> DecodeResult<T> where @@ -550,20 +700,10 @@ pub mod reader { where F: FnMut(&mut Decoder<'doc>, uint) -> DecodeResult<T>, { debug!("read_enum_struct_variant()"); - let idx = try!(self._next_uint(EsEnumVid)); + let idx = try!(self._next_sub()); debug!(" idx={}", idx); - let doc = try!(self.next_doc(EsEnumBody)); - - let (old_parent, old_pos) = (self.parent, self.pos); - self.parent = doc; - self.pos = self.parent.start; - - let result = try!(f(self, idx)); - - self.parent = old_parent; - self.pos = old_pos; - Ok(result) + f(self, idx) } fn read_enum_struct_variant_field<T, F>(&mut self, @@ -588,7 +728,6 @@ pub mod reader { F: FnOnce(&mut Decoder<'doc>) -> DecodeResult<T>, { debug!("read_struct_field(name={}, idx={})", name, idx); - try!(self._check_label(name)); f(self) } @@ -652,7 +791,7 @@ pub mod reader { { debug!("read_seq()"); self.push_doc(EsVec, move |d| { - let len = try!(d._next_uint(EsVecLen)); + let len = try!(d._next_sub()); debug!(" len={}", len); f(d, len) }) @@ -670,7 +809,7 @@ pub mod reader { { debug!("read_map()"); self.push_doc(EsMap, move |d| { - let len = try!(d._next_uint(EsMapLen)); + let len = try!(d._next_sub()); debug!(" len={}", len); f(d, len) }) @@ -701,11 +840,14 @@ pub mod writer { use std::num::Int; use std::old_io::{Writer, Seek}; use std::old_io; + use std::slice::bytes; + use std::num::ToPrimitive; - use super::{ EsVec, EsMap, EsEnum, EsVecLen, EsVecElt, EsMapLen, EsMapKey, - EsEnumVid, EsU64, EsU32, EsU16, EsU8, EsInt, EsI64, EsI32, EsI16, EsI8, - EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal, EsEnumBody, EsUint, - EsOpaque, EsLabel, EbmlEncoderTag }; + use super::{ EsVec, EsMap, EsEnum, EsSub8, EsSub32, EsVecElt, EsMapKey, + EsU64, EsU32, EsU16, EsU8, EsI64, EsI32, 
EsI16, EsI8, + EsBool, EsF64, EsF32, EsChar, EsStr, EsMapVal, + EsOpaque, NUM_IMPLICIT_TAGS, NUM_TAGS }; + use super::io::SeekableMemWriter; use serialize; @@ -713,9 +855,24 @@ pub mod writer { pub type EncodeResult = old_io::IoResult<()>; // rbml writing - pub struct Encoder<'a, W:'a> { - pub writer: &'a mut W, + pub struct Encoder<'a> { + pub writer: &'a mut SeekableMemWriter, size_positions: Vec<uint>, + relax_limit: u64, // do not move encoded bytes before this position + } + + fn write_tag<W: Writer>(w: &mut W, n: uint) -> EncodeResult { + if n < 0xf0 { + w.write_all(&[n as u8]) + } else if 0x100 <= n && n < NUM_TAGS { + w.write_all(&[0xf0 | (n >> 8) as u8, n as u8]) + } else { + Err(old_io::IoError { + kind: old_io::OtherIoError, + desc: "invalid tag", + detail: Some(format!("{}", n)) + }) + } } fn write_sized_vuint<W: Writer>(w: &mut W, n: uint, size: uint) -> EncodeResult { @@ -746,27 +903,30 @@ pub mod writer { }) } - impl<'a, W: Writer + Seek> Encoder<'a, W> { - pub fn new(w: &'a mut W) -> Encoder<'a, W> { + impl<'a> Encoder<'a> { + pub fn new(w: &'a mut SeekableMemWriter) -> Encoder<'a> { Encoder { writer: w, size_positions: vec!(), + relax_limit: 0, } } /// FIXME(pcwalton): Workaround for badness in trans. DO NOT USE ME. - pub unsafe fn unsafe_clone(&self) -> Encoder<'a, W> { + pub unsafe fn unsafe_clone(&self) -> Encoder<'a> { Encoder { writer: mem::transmute_copy(&self.writer), size_positions: self.size_positions.clone(), + relax_limit: self.relax_limit, } } pub fn start_tag(&mut self, tag_id: uint) -> EncodeResult { debug!("Start tag {:?}", tag_id); + assert!(tag_id >= NUM_IMPLICIT_TAGS); // Write the enum ID: - try!(write_vuint(self.writer, tag_id)); + try!(write_tag(self.writer, tag_id)); // Write a placeholder four-byte size. 
self.size_positions.push(try!(self.writer.tell()) as uint); @@ -779,11 +939,29 @@ pub mod writer { let cur_pos = try!(self.writer.tell()); try!(self.writer.seek(last_size_pos as i64, old_io::SeekSet)); let size = cur_pos as uint - last_size_pos - 4; - try!(write_sized_vuint(self.writer, size, 4)); - let r = try!(self.writer.seek(cur_pos as i64, old_io::SeekSet)); + + // relax the size encoding for small tags (bigger tags are costly to move). + // we should never try to move the stable positions, however. + const RELAX_MAX_SIZE: uint = 0x100; + if size <= RELAX_MAX_SIZE && last_size_pos >= self.relax_limit as uint { + // we can't alter the buffer in place, so have a temporary buffer + let mut buf = [0u8; RELAX_MAX_SIZE]; + { + let data = &self.writer.get_ref()[last_size_pos+4..cur_pos as uint]; + bytes::copy_memory(&mut buf, data); + } + + // overwrite the size and data and continue + try!(write_vuint(self.writer, size)); + try!(self.writer.write_all(&buf[..size])); + } else { + // overwrite the size with an overlong encoding and skip past the data + try!(write_sized_vuint(self.writer, size, 4)); + try!(self.writer.seek(cur_pos as i64, old_io::SeekSet)); + } debug!("End tag (size = {:?})", size); - Ok(r) + Ok(()) } pub fn wr_tag<F>(&mut self, tag_id: uint, blk: F) -> EncodeResult where @@ -795,7 +973,8 @@ pub mod writer { } pub fn wr_tagged_bytes(&mut self, tag_id: uint, b: &[u8]) -> EncodeResult { - try!(write_vuint(self.writer, tag_id)); + assert!(tag_id >= NUM_IMPLICIT_TAGS); + try!(write_tag(self.writer, tag_id)); try!(write_vuint(self.writer, b.len())); self.writer.write_all(b) } @@ -839,6 +1018,47 @@ pub mod writer { self.wr_tagged_bytes(tag_id, v.as_bytes()) } + // for auto-serialization + fn wr_tagged_raw_bytes(&mut self, tag_id: uint, b: &[u8]) -> EncodeResult { + try!(write_tag(self.writer, tag_id)); + self.writer.write_all(b) + } + + fn wr_tagged_raw_u64(&mut self, tag_id: uint, v: u64) -> EncodeResult { + let bytes: [u8; 8] = unsafe { 
mem::transmute(v.to_be()) }; + self.wr_tagged_raw_bytes(tag_id, &bytes) + } + + fn wr_tagged_raw_u32(&mut self, tag_id: uint, v: u32) -> EncodeResult{ + let bytes: [u8; 4] = unsafe { mem::transmute(v.to_be()) }; + self.wr_tagged_raw_bytes(tag_id, &bytes) + } + + fn wr_tagged_raw_u16(&mut self, tag_id: uint, v: u16) -> EncodeResult { + let bytes: [u8; 2] = unsafe { mem::transmute(v.to_be()) }; + self.wr_tagged_raw_bytes(tag_id, &bytes) + } + + fn wr_tagged_raw_u8(&mut self, tag_id: uint, v: u8) -> EncodeResult { + self.wr_tagged_raw_bytes(tag_id, &[v]) + } + + fn wr_tagged_raw_i64(&mut self, tag_id: uint, v: i64) -> EncodeResult { + self.wr_tagged_raw_u64(tag_id, v as u64) + } + + fn wr_tagged_raw_i32(&mut self, tag_id: uint, v: i32) -> EncodeResult { + self.wr_tagged_raw_u32(tag_id, v as u32) + } + + fn wr_tagged_raw_i16(&mut self, tag_id: uint, v: i16) -> EncodeResult { + self.wr_tagged_raw_u16(tag_id, v as u16) + } + + fn wr_tagged_raw_i8(&mut self, tag_id: uint, v: i8) -> EncodeResult { + self.wr_tagged_raw_bytes(tag_id, &[v as u8]) + } + pub fn wr_bytes(&mut self, b: &[u8]) -> EncodeResult { debug!("Write {:?} bytes", b.len()); self.writer.write_all(b) @@ -848,38 +1068,36 @@ pub mod writer { debug!("Write str: {:?}", s); self.writer.write_all(s.as_bytes()) } - } - - // FIXME (#2743): optionally perform "relaxations" on end_tag to more - // efficiently encode sizes; this is a fixed point iteration - - // Set to true to generate more debugging in EBML code. - // Totally lame approach. - #[cfg(not(ndebug))] - static DEBUG: bool = true; - #[cfg(ndebug)] - static DEBUG: bool = false; - impl<'a, W: Writer + Seek> Encoder<'a, W> { - // used internally to emit things like the vector length and so on - fn _emit_tagged_uint(&mut self, t: EbmlEncoderTag, v: uint) -> EncodeResult { - assert!(v <= 0xFFFF_FFFF); - self.wr_tagged_u32(t as uint, v as u32) + /// Returns the current position while marking it stable, i.e. 
+ /// generated bytes so far wouldn't be affected by relaxation. + pub fn mark_stable_position(&mut self) -> u64 { + let pos = self.writer.tell().unwrap(); + if self.relax_limit < pos { + self.relax_limit = pos; + } + pos } + } - fn _emit_label(&mut self, label: &str) -> EncodeResult { - // There are various strings that we have access to, such as - // the name of a record field, which do not actually appear in - // the encoded EBML (normally). This is just for - // efficiency. When debugging, though, we can emit such - // labels and then they will be checked by decoder to - // try and check panics more quickly. - if DEBUG { self.wr_tagged_str(EsLabel as uint, label) } - else { Ok(()) } + impl<'a> Encoder<'a> { + // used internally to emit things like the vector length and so on + fn _emit_tagged_sub(&mut self, v: uint) -> EncodeResult { + if let Some(v) = v.to_u8() { + self.wr_tagged_raw_u8(EsSub8 as uint, v) + } else if let Some(v) = v.to_u32() { + self.wr_tagged_raw_u32(EsSub32 as uint, v) + } else { + Err(old_io::IoError { + kind: old_io::OtherIoError, + desc: "length or variant id too big", + detail: Some(format!("{}", v)) + }) + } } pub fn emit_opaque<F>(&mut self, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<W>) -> EncodeResult, + F: FnOnce(&mut Encoder) -> EncodeResult, { try!(self.start_tag(EsOpaque as uint)); try!(f(self)); @@ -887,7 +1105,7 @@ pub mod writer { } } - impl<'a, W: Writer + Seek> serialize::Encoder for Encoder<'a, W> { + impl<'a> serialize::Encoder for Encoder<'a> { type Error = old_io::IoError; fn emit_nil(&mut self) -> EncodeResult { @@ -895,61 +1113,78 @@ pub mod writer { } fn emit_uint(&mut self, v: uint) -> EncodeResult { - self.wr_tagged_u64(EsUint as uint, v as u64) + self.emit_u64(v as u64) } fn emit_u64(&mut self, v: u64) -> EncodeResult { - self.wr_tagged_u64(EsU64 as uint, v) + match v.to_u32() { + Some(v) => self.emit_u32(v), + None => self.wr_tagged_raw_u64(EsU64 as uint, v) + } } fn emit_u32(&mut self, v: u32) -> 
EncodeResult { - self.wr_tagged_u32(EsU32 as uint, v) + match v.to_u16() { + Some(v) => self.emit_u16(v), + None => self.wr_tagged_raw_u32(EsU32 as uint, v) + } } fn emit_u16(&mut self, v: u16) -> EncodeResult { - self.wr_tagged_u16(EsU16 as uint, v) + match v.to_u8() { + Some(v) => self.emit_u8(v), + None => self.wr_tagged_raw_u16(EsU16 as uint, v) + } } fn emit_u8(&mut self, v: u8) -> EncodeResult { - self.wr_tagged_u8(EsU8 as uint, v) + self.wr_tagged_raw_u8(EsU8 as uint, v) } fn emit_int(&mut self, v: int) -> EncodeResult { - self.wr_tagged_i64(EsInt as uint, v as i64) + self.emit_i64(v as i64) } fn emit_i64(&mut self, v: i64) -> EncodeResult { - self.wr_tagged_i64(EsI64 as uint, v) + match v.to_i32() { + Some(v) => self.emit_i32(v), + None => self.wr_tagged_raw_i64(EsI64 as uint, v) + } } fn emit_i32(&mut self, v: i32) -> EncodeResult { - self.wr_tagged_i32(EsI32 as uint, v) + match v.to_i16() { + Some(v) => self.emit_i16(v), + None => self.wr_tagged_raw_i32(EsI32 as uint, v) + } } fn emit_i16(&mut self, v: i16) -> EncodeResult { - self.wr_tagged_i16(EsI16 as uint, v) + match v.to_i8() { + Some(v) => self.emit_i8(v), + None => self.wr_tagged_raw_i16(EsI16 as uint, v) + } } fn emit_i8(&mut self, v: i8) -> EncodeResult { - self.wr_tagged_i8(EsI8 as uint, v) + self.wr_tagged_raw_i8(EsI8 as uint, v) } fn emit_bool(&mut self, v: bool) -> EncodeResult { - self.wr_tagged_u8(EsBool as uint, v as u8) + self.wr_tagged_raw_u8(EsBool as uint, v as u8) } fn emit_f64(&mut self, v: f64) -> EncodeResult { let bits = unsafe { mem::transmute(v) }; - self.wr_tagged_u64(EsF64 as uint, bits) + self.wr_tagged_raw_u64(EsF64 as uint, bits) } fn emit_f32(&mut self, v: f32) -> EncodeResult { let bits = unsafe { mem::transmute(v) }; - self.wr_tagged_u32(EsF32 as uint, bits) + self.wr_tagged_raw_u32(EsF32 as uint, bits) } fn emit_char(&mut self, v: char) -> EncodeResult { - self.wr_tagged_u32(EsChar as uint, v as u32) + self.wr_tagged_raw_u32(EsChar as uint, v as u32) } fn emit_str(&mut 
self, v: &str) -> EncodeResult { self.wr_tagged_str(EsStr as uint, v) } - fn emit_enum<F>(&mut self, name: &str, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + fn emit_enum<F>(&mut self, _name: &str, f: F) -> EncodeResult where + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { - try!(self._emit_label(name)); try!(self.start_tag(EsEnum as uint)); try!(f(self)); self.end_tag() @@ -960,16 +1195,14 @@ pub mod writer { v_id: uint, _: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { - try!(self._emit_tagged_uint(EsEnumVid, v_id)); - try!(self.start_tag(EsEnumBody as uint)); - try!(f(self)); - self.end_tag() + try!(self._emit_tagged_sub(v_id)); + f(self) } fn emit_enum_variant_arg<F>(&mut self, _: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { f(self) } @@ -979,7 +1212,7 @@ pub mod writer { v_id: uint, cnt: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_enum_variant(v_name, v_id, cnt, f) } @@ -988,48 +1221,47 @@ pub mod writer { _: &str, idx: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_enum_variant_arg(idx, f) } fn emit_struct<F>(&mut self, _: &str, _len: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { f(self) } - fn emit_struct_field<F>(&mut self, name: &str, _: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + fn emit_struct_field<F>(&mut self, _name: &str, _: uint, f: F) -> EncodeResult where + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { - try!(self._emit_label(name)); f(self) } fn emit_tuple<F>(&mut self, len: uint, f: F) -> EncodeResult where - F: 
FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_seq(len, f) } fn emit_tuple_arg<F>(&mut self, idx: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_seq_elt(idx, f) } fn emit_tuple_struct<F>(&mut self, _: &str, len: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_seq(len, f) } fn emit_tuple_struct_arg<F>(&mut self, idx: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_seq_elt(idx, f) } fn emit_option<F>(&mut self, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_enum("Option", f) } @@ -1037,24 +1269,28 @@ pub mod writer { self.emit_enum_variant("None", 0, 0, |_| Ok(())) } fn emit_option_some<F>(&mut self, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { self.emit_enum_variant("Some", 1, 1, f) } fn emit_seq<F>(&mut self, len: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { + if len == 0 { + // empty vector optimization + return self.wr_tagged_bytes(EsVec as uint, &[]); + } try!(self.start_tag(EsVec as uint)); - try!(self._emit_tagged_uint(EsVecLen, len)); + try!(self._emit_tagged_sub(len)); try!(f(self)); self.end_tag() } fn emit_seq_elt<F>(&mut self, _idx: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { try!(self.start_tag(EsVecElt as uint)); @@ -1063,17 +1299,21 @@ pub mod writer { } fn emit_map<F>(&mut self, len: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: 
FnOnce(&mut Encoder<'a>) -> EncodeResult, { + if len == 0 { + // empty map optimization + return self.wr_tagged_bytes(EsMap as uint, &[]); + } try!(self.start_tag(EsMap as uint)); - try!(self._emit_tagged_uint(EsMapLen, len)); + try!(self._emit_tagged_sub(len)); try!(f(self)); self.end_tag() } fn emit_map_elt_key<F>(&mut self, _idx: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { try!(self.start_tag(EsMapKey as uint)); @@ -1082,7 +1322,7 @@ pub mod writer { } fn emit_map_elt_val<F>(&mut self, _idx: uint, f: F) -> EncodeResult where - F: FnOnce(&mut Encoder<'a, W>) -> EncodeResult, + F: FnOnce(&mut Encoder<'a>) -> EncodeResult, { try!(self.start_tag(EsMapVal as uint)); try!(f(self)); |
