A Rust library for reading the OpenStreetMap PBF file format (*.osm.pbf).

indexed.rs 10.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. //! Speed up searches by using an index
  2. use error::Result;
  3. use std::collections::BTreeSet;
  4. use std::fs::File;
  5. use std::io::{Read, Seek};
  6. use std::ops::RangeInclusive;
  7. use std::path::Path;
  8. use {BlobReader, BlobType, ByteOffset, Element, Way};
  /// Inclusive ID bounds (smallest and largest ID) for every element type.
  ///
  /// Each field holds the range `min..=max` of the IDs seen for that element type, or `None` if
  /// no range has been recorded for it.
  #[derive(Debug)]
  pub struct IdRanges {
      /// Smallest and largest node ID.
      node_ids: Option<RangeInclusive<i64>>,
      /// Smallest and largest way ID.
      way_ids: Option<RangeInclusive<i64>>,
      /// Smallest and largest relation ID.
      relation_ids: Option<RangeInclusive<i64>>,
  }
  /// Returns `true` if `node_ids` contains at least one value inside the inclusive `range`.
  ///
  /// An inverted range (start greater than end) contains no values, so `false` is returned for
  /// it. The explicit check is needed because `BTreeSet::range` panics when it is handed a range
  /// whose start is greater than its end.
  fn range_included(range: RangeInclusive<i64>, node_ids: &BTreeSet<i64>) -> bool {
      if range.start() > range.end() {
          return false;
      }
      node_ids.range(range).next().is_some()
  }
  /// Payload-free classification of a blob, stored in the index.
  #[derive(Clone, Copy, Debug, Eq, PartialEq)]
  enum SimpleBlobType {
      /// Counterpart of `BlobType::OsmHeader`.
      Header,
      /// Counterpart of `BlobType::OsmData`.
      Primitive,
      /// Counterpart of `BlobType::Unknown(_)`.
      Unknown,
  }
  26. #[derive(Debug)]
  27. struct BlobInfo {
  28. offset: ByteOffset,
  29. blob_type: SimpleBlobType,
  30. id_ranges: Option<IdRanges>,
  31. }
  32. /// Allows filtering elements and iterating over their dependencies.
  33. /// It chooses an efficient method for navigating the PBF structure to achieve this in reasonable
  34. /// time and with reasonable memory.
  35. pub struct IndexedReader<R: Read + Seek + Send> {
  36. reader: BlobReader<R>,
  37. index: Vec<BlobInfo>,
  38. }
  39. impl<R: Read + Seek + Send> IndexedReader<R> {
  40. /// Creates a new `IndexedReader`.
  41. ///
  42. /// # Example
  43. /// ```
  44. /// use osmpbf::*;
  45. ///
  46. /// # fn foo() -> Result<()> {
  47. /// let f = std::fs::File::open("tests/test.osm.pbf")?;
  48. /// let buf_reader = std::io::BufReader::new(f);
  49. ///
  50. /// let reader = IndexedReader::new(buf_reader)?;
  51. ///
  52. /// # Ok(())
  53. /// # }
  54. /// # foo().unwrap();
  55. /// ```
  56. pub fn new(reader: R) -> Result<Self> {
  57. let reader = BlobReader::new_seekable(reader)?;
  58. Ok(Self {
  59. reader,
  60. index: vec![],
  61. })
  62. }
  63. pub fn create_index(&mut self) -> Result<()> {
  64. // remove old items
  65. self.index.clear();
  66. while let Some(result) = self.reader.next_header_skip_blob() {
  67. let (header, offset) = result?;
  68. // Reader is seekable, so offset should be Some(ByteOffset)
  69. let offset = offset.unwrap();
  70. let blob_type = match header.blob_type() {
  71. BlobType::OsmHeader => SimpleBlobType::Header,
  72. BlobType::OsmData => SimpleBlobType::Primitive,
  73. BlobType::Unknown(_) => SimpleBlobType::Unknown,
  74. };
  75. self.index.push(BlobInfo {
  76. offset,
  77. blob_type,
  78. id_ranges: None,
  79. });
  80. }
  81. Ok(())
  82. }
  83. /// Filter ways using a closure and return matching ways and their dependent nodes (`Node`s and
  84. /// `DenseNode`s) in another closure.
  85. ///
  86. /// # Example
  87. /// ```
  88. /// use osmpbf::*;
  89. ///
  90. /// # fn foo() -> Result<()> {
  91. /// let mut reader = IndexedReader::from_path("tests/test.osm.pbf")?;
  92. /// let mut ways = 0;
  93. /// let mut nodes = 0;
  94. ///
  95. /// // Filter all ways that are buildings and count their nodes.
  96. /// reader.read_ways_and_deps(
  97. /// |way| {
  98. /// // Filter ways. Return true if tags contain "building": "yes".
  99. /// way.tags().any(|key_value| key_value == ("building", "yes"))
  100. /// },
  101. /// |element| {
  102. /// // Increment counter
  103. /// match element {
  104. /// Element::Way(way) => ways += 1,
  105. /// Element::Node(node) => nodes += 1,
  106. /// Element::DenseNode(dense_node) => nodes += 1,
  107. /// Element::Relation(_) => (), // should not occur
  108. /// }
  109. /// },
  110. /// )?;
  111. ///
  112. /// println!("ways: {}\nnodes: {}", ways, nodes);
  113. ///
  114. /// # assert_eq!(ways, 1);
  115. /// # assert_eq!(nodes, 3);
  116. /// # Ok(())
  117. /// # }
  118. /// # foo().unwrap();
  119. /// ```
  120. pub fn read_ways_and_deps<F, E>(
  121. &mut self,
  122. mut filter: F,
  123. mut element_callback: E,
  124. ) -> Result<()>
  125. where
  126. F: for<'a> FnMut(&Way<'a>) -> bool,
  127. E: for<'a> FnMut(&Element<'a>),
  128. {
  129. // Create index
  130. if self.index.is_empty() {
  131. self.create_index()?;
  132. }
  133. let mut node_ids: BTreeSet<i64> = BTreeSet::new();
  134. // First pass:
  135. // * Filter ways and store their dependencies as node IDs
  136. // * Store range of node IDs (min and max value) of each block
  137. for info in &mut self.index {
  138. //TODO do something useful with header blocks
  139. if info.blob_type == SimpleBlobType::Primitive {
  140. self.reader.seek(info.offset)?;
  141. let blob = self.reader.next().ok_or_else(|| {
  142. ::std::io::Error::new(
  143. ::std::io::ErrorKind::UnexpectedEof,
  144. "could not read next blob",
  145. )
  146. })??;
  147. let block = blob.to_primitiveblock()?;
  148. let mut min_node_id: Option<i64> = None;
  149. let mut max_node_id: Option<i64> = None;
  150. for group in block.groups() {
  151. // filter ways and record node IDs
  152. for way in group.ways() {
  153. if filter(&way) {
  154. let refs = way.refs();
  155. node_ids.extend(refs);
  156. // Return way
  157. element_callback(&Element::Way(way));
  158. }
  159. }
  160. // Check node IDs of this block, record min and max
  161. let mut check_min_max = |id| {
  162. min_node_id = Some(min_node_id.map_or(id, |x| x.min(id)));
  163. max_node_id = Some(max_node_id.map_or(id, |x| x.max(id)));
  164. };
  165. for node in group.nodes() {
  166. check_min_max(node.id())
  167. }
  168. for node in group.dense_nodes() {
  169. check_min_max(node.id)
  170. }
  171. }
  172. if let (Some(min), Some(max)) = (min_node_id, max_node_id) {
  173. info.id_ranges = Some(IdRanges {
  174. node_ids: Some(RangeInclusive::new(min, max)),
  175. way_ids: None,
  176. relation_ids: None,
  177. });
  178. }
  179. }
  180. }
  181. // Second pass:
  182. // * Iterate only over blobs that may include the node IDs we're searching for
  183. for info in &mut self.index {
  184. if info.blob_type == SimpleBlobType::Primitive {
  185. if let Some(node_id_range) = info.id_ranges.as_ref().and_then(|r| r.node_ids.as_ref()) {
  186. if range_included(node_id_range.clone(), &node_ids) {
  187. //TODO Only collect into Vec if range has a reasonable size
  188. let node_ids: Vec<i64> = node_ids.range(node_id_range.clone()).map(|x| *x).collect();
  189. self.reader.seek(info.offset)?;
  190. let blob = self.reader.next().ok_or_else(|| {
  191. ::std::io::Error::new(
  192. ::std::io::ErrorKind::UnexpectedEof,
  193. "could not read next blob",
  194. )
  195. })??;
  196. let block = blob.to_primitiveblock()?;
  197. for group in block.groups() {
  198. for node in group.nodes() {
  199. if node_ids.binary_search(&node.id()).is_ok() {
  200. // ID found, return node
  201. element_callback(&Element::Node(node));
  202. }
  203. }
  204. for node in group.dense_nodes() {
  205. if node_ids.binary_search(&node.id).is_ok() {
  206. // ID found, return dense node
  207. element_callback(&Element::DenseNode(node));
  208. }
  209. }
  210. }
  211. }
  212. }
  213. }
  214. }
  215. Ok(())
  216. }
  217. }
  218. impl IndexedReader<File> {
  219. /// Creates a new `IndexedReader` from a given path.
  220. ///
  221. /// # Example
  222. /// ```
  223. /// use osmpbf::*;
  224. ///
  225. /// # fn foo() -> Result<()> {
  226. /// let reader = IndexedReader::from_path("tests/test.osm.pbf")?;
  227. ///
  228. /// # Ok(())
  229. /// # }
  230. /// # foo().unwrap();
  231. /// ```
  232. pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
  233. //TODO take some more measurements to determine if `BufReader` should be used here
  234. let f = File::open(path)?;
  235. Self::new(f)
  236. }
  237. }
  #[cfg(test)]
  mod tests {
      use super::*;

      /// Checks `range_included` against the set {1, 2, 6} for ranges that lie before, between,
      /// on and after the stored values.
      #[test]
      fn test_range_included_set() {
          let set: BTreeSet<i64> = [1, 2, 6].iter().cloned().collect();

          // (range start, range end, expected result)
          let cases: &[(i64, i64, bool)] = &[
              (0, 0, false),
              (1, 1, true),
              (2, 2, true),
              (3, 3, false),
              (3, 5, false),
              (3, 6, true),
              (6, 6, true),
              (7, 7, false),
              (0, 1, true),
              (6, 7, true),
              (2, 3, true),
              (5, 6, true),
              (5, 8, true),
              (0, 8, true),
              (0, 4, true),
          ];

          for &(start, end, expected) in cases {
              assert_eq!(
                  range_included(RangeInclusive::new(start, end), &set),
                  expected,
                  "range {}..={}",
                  start,
                  end
              );
          }
      }
  }
  261. }