A Rust library for reading the OpenStreetMap PBF file format (*.osm.pbf).

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. //! Read and decode blobs
  2. extern crate byteorder;
  3. extern crate protobuf;
  4. use block::{HeaderBlock, PrimitiveBlock};
  5. use byteorder::ReadBytesExt;
  6. use error::{new_blob_error, new_protobuf_error, BlobError, Result};
  7. use proto::fileformat;
  8. use std::fs::File;
  9. use std::io::{BufReader, Read, Seek, SeekFrom};
  10. use std::path::Path;
  11. use util::{parse_message_from_bytes, parse_message_from_reader};
  12. #[cfg(feature = "system-libz")]
  13. use flate2::read::ZlibDecoder;
  14. #[cfg(not(feature = "system-libz"))]
  15. use inflate::DeflateDecoder;
  /// Size limit for an encoded `BlobHeader`, in bytes (64 KiB).
  ///
  /// A header whose announced size is equal to or larger than this value is rejected.
  pub static MAX_BLOB_HEADER_SIZE: u64 = 64 << 10;
  /// Size limit for the uncompressed content of a `Blob`, in bytes (32 MiB).
  ///
  /// Raw payloads of this size or larger are rejected; decompressed output is cut off after
  /// this many bytes.
  pub static MAX_BLOB_MESSAGE_SIZE: u64 = 32 << 20;
  /// Identifies what kind of content a blob carries.
  #[derive(Debug, Clone, PartialEq, Eq)]
  pub enum BlobType<'a> {
      /// The blob holds a `HeaderBlock`.
      OsmHeader,
      /// The blob holds a `PrimitiveBlock`.
      OsmData,
      /// The blob announces a type that is neither of the above; the type string is kept
      /// verbatim. Parsers should ignore unknown blobs they do not expect.
      Unknown(&'a str),
  }
  31. //TODO rename variants to fit proto files
  32. /// The decoded content of a blob (analogous to `BlobType`).
  33. #[derive(Clone, Debug)]
  34. pub enum BlobDecode<'a> {
  35. /// Blob contains a `HeaderBlock`.
  36. OsmHeader(Box<HeaderBlock>),
  37. /// Blob contains a `PrimitiveBlock`.
  38. OsmData(PrimitiveBlock),
  39. /// An unknown blob type with the given string identifier.
  40. /// Parsers should ignore unknown blobs they do not expect.
  41. Unknown(&'a str),
  42. }
  /// Position of a blob, counted in bytes from the start of the stream.
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
  pub struct ByteOffset(pub u64);
  46. /// A blob.
  47. ///
  48. /// A PBF file consists of a sequence of blobs. This type supports decoding the content of a blob
  49. /// to different types of blocks that are usually more interesting to the user.
  50. #[derive(Clone, Debug)]
  51. pub struct Blob {
  52. header: fileformat::BlobHeader,
  53. blob: fileformat::Blob,
  54. offset: Option<ByteOffset>,
  55. }
  56. impl Blob {
  57. fn new(
  58. header: fileformat::BlobHeader,
  59. blob: fileformat::Blob,
  60. offset: Option<ByteOffset>,
  61. ) -> Blob {
  62. Blob {
  63. header,
  64. blob,
  65. offset,
  66. }
  67. }
  68. /// Decodes the Blob and tries to obtain the inner content (usually a `HeaderBlock` or a
  69. /// `PrimitiveBlock`). This operation might involve an expensive decompression step.
  70. pub fn decode(&self) -> Result<BlobDecode> {
  71. match self.get_type() {
  72. BlobType::OsmHeader => {
  73. let block = Box::new(self.to_headerblock()?);
  74. Ok(BlobDecode::OsmHeader(block))
  75. }
  76. BlobType::OsmData => {
  77. let block = self.to_primitiveblock()?;
  78. Ok(BlobDecode::OsmData(block))
  79. }
  80. BlobType::Unknown(x) => Ok(BlobDecode::Unknown(x)),
  81. }
  82. }
  83. /// Returns the type of a blob without decoding its content.
  84. pub fn get_type(&self) -> BlobType {
  85. match self.header.get_field_type() {
  86. "OSMHeader" => BlobType::OsmHeader,
  87. "OSMData" => BlobType::OsmData,
  88. x => BlobType::Unknown(x),
  89. }
  90. }
  91. /// Returns the byte offset of the blob from the start of its source stream.
  92. /// This might be `None` if the source stream does not implement `Seek`.
  93. pub fn offset(&self) -> Option<ByteOffset> {
  94. self.offset
  95. }
  96. /// Tries to decode the blob to a `HeaderBlock`. This operation might involve an expensive
  97. /// decompression step.
  98. pub fn to_headerblock(&self) -> Result<HeaderBlock> {
  99. decode_blob(&self.blob).map(HeaderBlock::new)
  100. }
  101. /// Tries to decode the blob to a `PrimitiveBlock`. This operation might involve an expensive
  102. /// decompression step.
  103. pub fn to_primitiveblock(&self) -> Result<PrimitiveBlock> {
  104. decode_blob(&self.blob).map(PrimitiveBlock::new)
  105. }
  106. }
  107. /// A blob header.
  108. ///
  109. /// Just contains information about the size and type of the following `Blob`.
  110. #[derive(Clone, Debug)]
  111. pub struct BlobHeader {
  112. header: fileformat::BlobHeader,
  113. }
  114. impl BlobHeader {
  115. fn new(header: fileformat::BlobHeader) -> Self {
  116. BlobHeader { header }
  117. }
  118. /// Returns the type of the following blob.
  119. pub fn blob_type(&self) -> BlobType {
  120. match self.header.get_field_type() {
  121. "OSMHeader" => BlobType::OsmHeader,
  122. "OSMData" => BlobType::OsmData,
  123. x => BlobType::Unknown(x),
  124. }
  125. }
  126. /// Returns the size of the following blob in bytes.
  127. pub fn get_blob_size(&self) -> i32 {
  128. self.header.get_datasize()
  129. }
  130. }
  131. /// A reader for PBF files that allows iterating over `Blob`s.
  132. #[derive(Clone, Debug)]
  133. pub struct BlobReader<R: Read + Send> {
  134. reader: R,
  135. /// Current reader offset in bytes from the start of the stream.
  136. offset: Option<ByteOffset>,
  137. last_blob_ok: bool,
  138. }
  139. impl<R: Read + Send> BlobReader<R> {
  140. /// Creates a new `BlobReader`.
  141. ///
  142. /// # Example
  143. /// ```
  144. /// use osmpbf::*;
  145. ///
  146. /// # fn foo() -> Result<()> {
  147. /// let f = std::fs::File::open("tests/test.osm.pbf")?;
  148. /// let buf_reader = std::io::BufReader::new(f);
  149. ///
  150. /// let reader = BlobReader::new(buf_reader);
  151. ///
  152. /// # Ok(())
  153. /// # }
  154. /// # foo().unwrap();
  155. /// ```
  156. pub fn new(reader: R) -> BlobReader<R> {
  157. BlobReader {
  158. reader,
  159. offset: None,
  160. last_blob_ok: true,
  161. }
  162. }
  163. fn read_blob_header(&mut self) -> Option<Result<fileformat::BlobHeader>> {
  164. let header_size: u64 = match self.reader.read_u32::<byteorder::BigEndian>() {
  165. Ok(n) => {
  166. self.offset = self.offset.map(|x| ByteOffset(x.0 + 4));
  167. u64::from(n)
  168. }
  169. Err(e) => {
  170. self.offset = None;
  171. match e.kind() {
  172. ::std::io::ErrorKind::UnexpectedEof => {
  173. //TODO This also accepts corrupted files in the case of 1-3 available bytes
  174. return None;
  175. }
  176. _ => {
  177. self.last_blob_ok = false;
  178. return Some(Err(new_blob_error(BlobError::InvalidHeaderSize)));
  179. }
  180. }
  181. }
  182. };
  183. if header_size >= MAX_BLOB_HEADER_SIZE {
  184. self.last_blob_ok = false;
  185. return Some(Err(new_blob_error(BlobError::HeaderTooBig {
  186. size: header_size,
  187. })));
  188. }
  189. let header: fileformat::BlobHeader =
  190. match parse_message_from_reader(&mut self.reader.by_ref().take(header_size)) {
  191. Ok(header) => header,
  192. Err(e) => {
  193. self.offset = None;
  194. self.last_blob_ok = false;
  195. return Some(Err(new_protobuf_error(e, "blob header")));
  196. }
  197. };
  198. self.offset = self.offset.map(|x| ByteOffset(x.0 + header_size));
  199. Some(Ok(header))
  200. }
  201. }
  202. impl BlobReader<BufReader<File>> {
  203. /// Tries to open the file at the given path and constructs a `BlobReader` from this.
  204. ///
  205. /// # Errors
  206. /// Returns the same errors that `std::fs::File::open` returns.
  207. ///
  208. /// # Example
  209. /// ```
  210. /// use osmpbf::*;
  211. ///
  212. /// # fn foo() -> Result<()> {
  213. /// let reader = BlobReader::from_path("tests/test.osm.pbf")?;
  214. /// # Ok(())
  215. /// # }
  216. /// # foo().unwrap();
  217. /// ```
  218. pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
  219. let f = File::open(path)?;
  220. let reader = BufReader::new(f);
  221. Ok(BlobReader {
  222. reader,
  223. offset: Some(ByteOffset(0)),
  224. last_blob_ok: true,
  225. })
  226. }
  227. }
  228. impl<R: Read + Send> Iterator for BlobReader<R> {
  229. type Item = Result<Blob>;
  230. fn next(&mut self) -> Option<Self::Item> {
  231. // Stop iteration if there was an error.
  232. if !self.last_blob_ok {
  233. return None;
  234. }
  235. let prev_offset = self.offset;
  236. let header = match self.read_blob_header() {
  237. Some(Ok(header)) => header,
  238. Some(Err(err)) => return Some(Err(err)),
  239. None => return None,
  240. };
  241. let blob: fileformat::Blob = match parse_message_from_reader(
  242. &mut self.reader.by_ref().take(header.get_datasize() as u64),
  243. ) {
  244. Ok(blob) => blob,
  245. Err(e) => {
  246. self.offset = None;
  247. self.last_blob_ok = false;
  248. return Some(Err(new_protobuf_error(e, "blob content")));
  249. }
  250. };
  251. self.offset = self
  252. .offset
  253. .map(|x| ByteOffset(x.0 + header.get_datasize() as u64));
  254. Some(Ok(Blob::new(header, blob, prev_offset)))
  255. }
  256. }
  257. impl<R: Read + Seek + Send> BlobReader<R> {
  258. /// Creates a new `BlobReader` from the given reader that is seekable and will be initialized
  259. /// with a valid offset.
  260. ///
  261. /// # Example
  262. /// ```
  263. /// use osmpbf::*;
  264. ///
  265. /// # fn foo() -> Result<()> {
  266. /// let f = std::fs::File::open("tests/test.osm.pbf")?;
  267. /// let buf_reader = std::io::BufReader::new(f);
  268. ///
  269. /// let mut reader = BlobReader::new_seekable(buf_reader)?;
  270. /// let first_blob = reader.next().unwrap()?;
  271. ///
  272. /// assert_eq!(first_blob.offset(), Some(ByteOffset(0)));
  273. /// # Ok(())
  274. /// # }
  275. /// # foo().unwrap();
  276. /// ```
  277. pub fn new_seekable(mut reader: R) -> Result<BlobReader<R>> {
  278. let pos = reader.seek(SeekFrom::Current(0))?;
  279. Ok(BlobReader {
  280. reader,
  281. offset: Some(ByteOffset(pos)),
  282. last_blob_ok: true,
  283. })
  284. }
  285. /// Seek to an offset in bytes from the start of the stream.
  286. ///
  287. /// # Example
  288. /// ```
  289. /// use osmpbf::*;
  290. ///
  291. /// # fn foo() -> Result<()> {
  292. /// let mut reader = BlobReader::from_path("tests/test.osm.pbf")?;
  293. /// let first_blob = reader.next().unwrap()?;
  294. /// let second_blob = reader.next().unwrap()?;
  295. ///
  296. /// reader.seek(first_blob.offset().unwrap())?;
  297. ///
  298. /// let first_blob_again = reader.next().unwrap()?;
  299. /// assert_eq!(first_blob.offset(), first_blob_again.offset());
  300. /// # Ok(())
  301. /// # }
  302. /// # foo().unwrap();
  303. /// ```
  304. pub fn seek(&mut self, pos: ByteOffset) -> Result<()> {
  305. match self.reader.seek(SeekFrom::Start(pos.0)) {
  306. Ok(offset) => {
  307. self.offset = Some(ByteOffset(offset));
  308. Ok(())
  309. }
  310. Err(e) => {
  311. self.offset = None;
  312. Err(e.into())
  313. }
  314. }
  315. }
  316. /// Seek to an offset in bytes. (See `std::io::Seek`)
  317. pub fn seek_raw(&mut self, pos: SeekFrom) -> Result<u64> {
  318. match self.reader.seek(pos) {
  319. Ok(offset) => {
  320. self.offset = Some(ByteOffset(offset));
  321. Ok(offset)
  322. }
  323. Err(e) => {
  324. self.offset = None;
  325. Err(e.into())
  326. }
  327. }
  328. }
  329. /// Read and return next `BlobHeader` but skip the following `Blob`. This allows really fast
  330. /// iteration of the PBF structure if only the byte offset and `BlobType` are important.
  331. /// On success, returns the `BlobHeader` and the byte offset of the header which can also be
  332. /// used as an offset for reading the entire `Blob` (including header).
  333. pub fn next_header_skip_blob(&mut self) -> Option<Result<(BlobHeader, Option<ByteOffset>)>> {
  334. // Stop iteration if there was an error.
  335. if !self.last_blob_ok {
  336. return None;
  337. }
  338. let prev_offset = self.offset;
  339. // read header
  340. let header = match self.read_blob_header() {
  341. Some(Ok(header)) => header,
  342. Some(Err(err)) => return Some(Err(err)),
  343. None => return None,
  344. };
  345. // skip blob (which also adjusts self.offset)
  346. if let Err(err) = self.seek_raw(SeekFrom::Current(header.get_datasize() as i64)) {
  347. self.last_blob_ok = false;
  348. return Some(Err(err));
  349. }
  350. Some(Ok((BlobHeader::new(header), prev_offset)))
  351. }
  352. }
  353. impl BlobReader<BufReader<File>> {
  354. /// Creates a new `BlobReader` from the given path that is seekable and will be initialized
  355. /// with a valid offset.
  356. ///
  357. /// # Example
  358. /// ```
  359. /// use osmpbf::*;
  360. ///
  361. /// # fn foo() -> Result<()> {
  362. /// let mut reader = BlobReader::seekable_from_path("tests/test.osm.pbf")?;
  363. /// let first_blob = reader.next().unwrap()?;
  364. ///
  365. /// assert_eq!(first_blob.offset(), Some(ByteOffset(0)));
  366. /// # Ok(())
  367. /// # }
  368. /// # foo().unwrap();
  369. /// ```
  370. pub fn seekable_from_path<P: AsRef<Path>>(path: P) -> Result<BlobReader<BufReader<File>>> {
  371. let f = File::open(path.as_ref())?;
  372. let buf_reader = BufReader::new(f);
  373. Self::new_seekable(buf_reader)
  374. }
  375. }
  /// Decodes the payload of `blob` into a protobuf message of type `T` (variant that inflates
  /// zlib data with `flate2`).
  ///
  /// * Raw payloads are parsed directly, unless their length reaches `MAX_BLOB_MESSAGE_SIZE`,
  ///   in which case `BlobError::MessageTooBig` is returned.
  /// * Zlib payloads are decompressed on the fly; at most `MAX_BLOB_MESSAGE_SIZE` decompressed
  ///   bytes are handed to the parser.
  /// * A blob with neither kind of payload yields `BlobError::Empty`.
  #[cfg(feature = "system-libz")]
  pub(crate) fn decode_blob<T>(blob: &fileformat::Blob) -> Result<T>
  where
      T: protobuf::Message,
  {
      if blob.has_raw() {
          let raw = blob.get_raw();
          let size = raw.len() as u64;
          if size >= MAX_BLOB_MESSAGE_SIZE {
              return Err(new_blob_error(BlobError::MessageTooBig { size }));
          }
          return parse_message_from_bytes(raw).map_err(|e| new_protobuf_error(e, "raw blob data"));
      }

      if blob.has_zlib_data() {
          let mut limited = ZlibDecoder::new(blob.get_zlib_data()).take(MAX_BLOB_MESSAGE_SIZE);
          return parse_message_from_reader(&mut limited)
              .map_err(|e| new_protobuf_error(e, "blob zlib data"));
      }

      Err(new_blob_error(BlobError::Empty))
  }
  396. #[cfg(not(feature = "system-libz"))]
  397. pub(crate) fn decode_blob<T>(blob: &fileformat::Blob) -> Result<T>
  398. where
  399. T: protobuf::Message,
  400. {
  401. if blob.has_raw() {
  402. let size = blob.get_raw().len() as u64;
  403. if size < MAX_BLOB_MESSAGE_SIZE {
  404. parse_message_from_bytes(blob.get_raw())
  405. .map_err(|e| new_protobuf_error(e, "raw blob data"))
  406. } else {
  407. Err(new_blob_error(BlobError::MessageTooBig { size }))
  408. }
  409. } else if blob.has_zlib_data() {
  410. let mut decoder =
  411. DeflateDecoder::from_zlib(blob.get_zlib_data()).take(MAX_BLOB_MESSAGE_SIZE);
  412. parse_message_from_reader(&mut decoder).map_err(|e| new_protobuf_error(e, "blob zlib data"))
  413. } else {
  414. Err(new_blob_error(BlobError::Empty))
  415. }
  416. }