Преглед на файлове

Add IndexedReader

This allows filtering elements and finding their dependencies by
decoding as few blobs as possible.
Johannes Hofmann преди 5 години
родител
ревизия
e5fd662654
променени са 3 файла, в които са добавени 336 реда и са изтрити 0 реда
  1. 305
    0
      src/indexed.rs
  2. 2
    0
      src/lib.rs
  3. 29
    0
      tests/read.rs

+ 305
- 0
src/indexed.rs Целия файл

@@ -0,0 +1,305 @@
1
+//! Speed up searches by using an index
2
+
3
+use error::Result;
4
+use std::fs::File;
5
+use std::io::{Read, Seek};
6
+use std::ops::RangeInclusive;
7
+use std::path::Path;
8
+use {BlobReader, BlobType, ByteOffset, Element, Way};
9
+
10
+/// Stores the minimum and maximum id of every element type.
11
+#[derive(Debug)]
12
+pub struct IdRanges {
13
+    node_ids: Option<RangeInclusive<i64>>,
14
+    way_ids: Option<RangeInclusive<i64>>,
15
+    relation_ids: Option<RangeInclusive<i64>>,
16
+}
17
+
18
+/// Checks if `sorted_slice` contains some values from the given `range`.
19
+/// Assumes that `sorted_slice` is sorted.
20
+/// Returns the range of indices into `sorted_slice` that needs to be checked.
21
+/// Returns `None` if it is guaranteed that no values from `sorted_slice` are inside `range`.
22
+fn range_included(range: &RangeInclusive<i64>, sorted_slice: &[i64]) -> Option<RangeInclusive<usize>> {
23
+    match (sorted_slice.binary_search(&range.start()), sorted_slice.binary_search(&range.end())) {
24
+        (Ok(start), Ok(end)) => Some(RangeInclusive::new(start, end)),
25
+        (Ok(start), Err(end)) => Some(RangeInclusive::new(start, end.saturating_sub(1))),
26
+        (Err(start), Ok(end)) => Some(RangeInclusive::new(start, end)),
27
+        (Err(start), Err(end)) => {
28
+            if start == end {
29
+                None
30
+            } else {
31
+                Some(RangeInclusive::new(start, end.saturating_sub(1)))
32
+            }
33
+        },
34
+    }
35
+}
36
+
37
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
38
+enum SimpleBlobType {
39
+    Header,
40
+    Primitive,
41
+    Unknown,
42
+}
43
+
44
+#[derive(Debug)]
45
+struct BlobInfo {
46
+    offset: ByteOffset,
47
+    blob_type: SimpleBlobType,
48
+    id_ranges: Option<IdRanges>,
49
+}
50
+
51
+/// Allows filtering elements and iterating over their dependencies.
52
+/// It chooses an efficient method for navigating the PBF structure to achieve this in reasonable
53
+/// time and with reasonable memory.
54
+pub struct IndexedReader<R: Read + Seek> {
55
+    reader: BlobReader<R>,
56
+    index: Vec<BlobInfo>,
57
+}
58
+
59
+impl<R: Read + Seek> IndexedReader<R> {
60
+    /// Creates a new `IndexedReader`.
61
+    ///
62
+    /// # Example
63
+    /// ```
64
+    /// use osmpbf::*;
65
+    ///
66
+    /// # fn foo() -> Result<()> {
67
+    /// let f = std::fs::File::open("tests/test.osm.pbf")?;
68
+    /// let buf_reader = std::io::BufReader::new(f);
69
+    ///
70
+    /// let reader = IndexedReader::new(buf_reader)?;
71
+    ///
72
+    /// # Ok(())
73
+    /// # }
74
+    /// # foo().unwrap();
75
+    /// ```
76
+    pub fn new(reader: R) -> Result<Self> {
77
+        let reader = BlobReader::new_seekable(reader)?;
78
+        Ok(Self {
79
+            reader,
80
+            index: vec![],
81
+        })
82
+    }
83
+
84
+    fn create_index(&mut self) -> Result<()> {
85
+        // remove old items
86
+        self.index.clear();
87
+
88
+        for blob in &mut self.reader {
89
+            let blob = blob?;
90
+            // Reader is seekable, so offset should return Some(ByteOffset)
91
+            let offset = blob.offset().unwrap();
92
+            let blob_type = match blob.get_type() {
93
+                BlobType::OsmHeader => SimpleBlobType::Header,
94
+                BlobType::OsmData => SimpleBlobType::Primitive,
95
+                BlobType::Unknown(_) => SimpleBlobType::Unknown,
96
+            };
97
+
98
+            self.index.push(BlobInfo {
99
+                offset,
100
+                blob_type,
101
+                id_ranges: None,
102
+            });
103
+        }
104
+
105
+        Ok(())
106
+    }
107
+
108
+    /// Filter ways using a closure and return matching ways and their dependent nodes (`Node`s and
109
+    /// `DenseNode`s) in another closure.
110
+    ///
111
+    /// # Example
112
+    /// ```
113
+    /// use osmpbf::*;
114
+    ///
115
+    /// # fn foo() -> Result<()> {
116
+    /// let mut reader = IndexedReader::from_path("tests/test.osm.pbf")?;
117
+    /// let mut ways = 0;
118
+    /// let mut nodes = 0;
119
+    ///
120
+    /// // Filter all ways that are buildings and count their nodes.
121
+    /// reader.read_ways_and_deps(
122
+    ///     |way| {
123
+    ///         // Filter ways. Return true if tags contain "building": "yes".
124
+    ///         way.tags().any(|key_value| key_value == ("building", "yes"))
125
+    ///     },
126
+    ///     |element| {
127
+    ///         // Increment counter
128
+    ///         match element {
129
+    ///             Element::Way(way) => ways += 1,
130
+    ///             Element::Node(node) => nodes += 1,
131
+    ///             Element::DenseNode(dense_node) => nodes += 1,
132
+    ///             Element::Relation(_) => (), // should not occur
133
+    ///         }
134
+    ///     },
135
+    /// )?;
136
+    ///
137
+    /// println!("ways:  {}\nnodes: {}", ways, nodes);
138
+    ///
139
+    /// # assert_eq!(ways, 1);
140
+    /// # assert_eq!(nodes, 3);
141
+    /// # Ok(())
142
+    /// # }
143
+    /// # foo().unwrap();
144
+    /// ```
145
+    pub fn read_ways_and_deps<F, E>(
146
+        &mut self,
147
+        mut filter: F,
148
+        mut element_callback: E,
149
+    ) -> Result<()>
150
+    where
151
+        F: for<'a> FnMut(&Way<'a>) -> bool,
152
+        E: for<'a> FnMut(&Element<'a>),
153
+    {
154
+        // Create index
155
+        if self.index.is_empty() {
156
+            self.create_index()?;
157
+        }
158
+
159
+        let mut node_ids: Vec<i64> = vec![];
160
+
161
+        // First pass:
162
+        //   * Filter ways and store their dependencies as node IDs
163
+        //   * Store range of node IDs (min and max value) of each block
164
+        for info in &mut self.index {
165
+            //TODO do something useful with header blocks
166
+            if info.blob_type == SimpleBlobType::Primitive {
167
+                self.reader.seek(info.offset)?;
168
+                let blob = self.reader.next().ok_or_else(|| {
169
+                    ::std::io::Error::new(
170
+                        ::std::io::ErrorKind::UnexpectedEof,
171
+                        "could not read next blob",
172
+                    )
173
+                })??;
174
+                let block = blob.to_primitiveblock()?;
175
+                let mut min_node_id: Option<i64> = None;
176
+                let mut max_node_id: Option<i64> = None;
177
+                for group in block.groups() {
178
+                    // filter ways and record node IDs
179
+                    for way in group.ways() {
180
+                        if filter(&way) {
181
+                            let refs = way.refs();
182
+
183
+                            node_ids.reserve(refs.size_hint().0);
184
+                            for node_id in refs {
185
+                                node_ids.push(node_id);
186
+                            }
187
+
188
+                            // Return way
189
+                            element_callback(&Element::Way(way));
190
+                        }
191
+                    }
192
+
193
+                    // Check node IDs of this block, record min and max
194
+
195
+                    let mut check_min_max = |id| {
196
+                        min_node_id = Some(min_node_id.map_or(id, |x| x.min(id)));
197
+                        max_node_id = Some(max_node_id.map_or(id, |x| x.max(id)));
198
+                    };
199
+
200
+                    for node in group.nodes() {
201
+                        check_min_max(node.id())
202
+                    }
203
+                    for node in group.dense_nodes() {
204
+                        check_min_max(node.id)
205
+                    }
206
+                }
207
+                if let (Some(min), Some(max)) = (min_node_id, max_node_id) {
208
+                    info.id_ranges = Some(IdRanges {
209
+                        node_ids: Some(RangeInclusive::new(min, max)),
210
+                        way_ids: None,
211
+                        relation_ids: None,
212
+                    });
213
+                }
214
+            }
215
+        }
216
+
217
+        // Sort, to enable binary search
218
+        node_ids.sort_unstable();
219
+
220
+        // Remove duplicate node IDs
221
+        node_ids.dedup();
222
+
223
+        // Second pass:
224
+        //   * Iterate only over blobs that may include the node IDs we're searching for
225
+        for info in &mut self.index {
226
+            if info.blob_type == SimpleBlobType::Primitive {
227
+                if let Some(node_id_range) = info.id_ranges.as_ref().and_then(|r| r.node_ids.as_ref()) {
228
+                    if let Some(slice_range) = range_included(node_id_range, &node_ids) {
229
+                        let ids_subslice = &node_ids.as_slice()[slice_range];
230
+
231
+                        self.reader.seek(info.offset)?;
232
+                        let blob = self.reader.next().ok_or_else(|| {
233
+                            ::std::io::Error::new(
234
+                                ::std::io::ErrorKind::UnexpectedEof,
235
+                                "could not read next blob",
236
+                            )
237
+                        })??;
238
+                        let block = blob.to_primitiveblock()?;
239
+                        for group in block.groups() {
240
+                            for node in group.nodes() {
241
+                                let id = node.id();
242
+                                if ids_subslice.binary_search(&id).is_ok() {
243
+                                    // ID found, return node
244
+                                    element_callback(&Element::Node(node));
245
+                                }
246
+                            }
247
+                            for node in group.dense_nodes() {
248
+                                let id = node.id;
249
+                                if ids_subslice.binary_search(&id).is_ok() {
250
+                                    // ID found, return dense node
251
+                                    element_callback(&Element::DenseNode(node));
252
+                                }
253
+                            }
254
+                        }
255
+                    }
256
+                }
257
+            }
258
+        }
259
+
260
+        Ok(())
261
+    }
262
+}
263
+
264
+impl IndexedReader<File> {
265
+    /// Creates a new `IndexedReader` from a given path.
266
+    ///
267
+    /// # Example
268
+    /// ```
269
+    /// use osmpbf::*;
270
+    ///
271
+    /// # fn foo() -> Result<()> {
272
+    /// let reader = IndexedReader::from_path("tests/test.osm.pbf")?;
273
+    ///
274
+    /// # Ok(())
275
+    /// # }
276
+    /// # foo().unwrap();
277
+    /// ```
278
+    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
279
+        //TODO take some more measurements to determine if `BufReader` should be used here
280
+        let f = File::open(path)?;
281
+        Self::new(f)
282
+    }
283
+}
284
+
285
+#[cfg(test)]
286
+mod tests {
287
+    use super::*;
288
+
289
+    #[test]
290
+    fn test_range_included() {
291
+        assert_eq!(range_included(&RangeInclusive::new(0, 0), &[1,2,3]), None);
292
+        assert_eq!(range_included(&RangeInclusive::new(1, 1), &[1,2,3]), Some(RangeInclusive::new(0, 0)));
293
+        assert_eq!(range_included(&RangeInclusive::new(2, 2), &[1,2,3]), Some(RangeInclusive::new(1, 1)));
294
+        assert_eq!(range_included(&RangeInclusive::new(3, 3), &[1,2,3]), Some(RangeInclusive::new(2, 2)));
295
+        assert_eq!(range_included(&RangeInclusive::new(4, 4), &[1,2,3]), None);
296
+        assert_eq!(range_included(&RangeInclusive::new(0, 1), &[1,2,3]), Some(RangeInclusive::new(0, 0)));
297
+        assert_eq!(range_included(&RangeInclusive::new(3, 4), &[1,2,3]), Some(RangeInclusive::new(2, 2)));
298
+        assert_eq!(range_included(&RangeInclusive::new(4, 4), &[1,2,6]), None);
299
+        assert_eq!(range_included(&RangeInclusive::new(2, 3), &[1,2,6]), Some(RangeInclusive::new(1, 1)));
300
+        assert_eq!(range_included(&RangeInclusive::new(5, 6), &[1,2,6]), Some(RangeInclusive::new(2, 2)));
301
+        assert_eq!(range_included(&RangeInclusive::new(5, 8), &[1,2,6]), Some(RangeInclusive::new(2, 2)));
302
+        assert_eq!(range_included(&RangeInclusive::new(0, 8), &[1,2,6]), Some(RangeInclusive::new(0, 2)));
303
+        assert_eq!(range_included(&RangeInclusive::new(0, 4), &[1,2,6]), Some(RangeInclusive::new(0, 1)));
304
+    }
305
+}

+ 2
- 0
src/lib.rs Целия файл

@@ -81,6 +81,7 @@ pub use block::*;
81 81
 pub use dense::*;
82 82
 pub use elements::*;
83 83
 pub use error::{BlobError, Error, ErrorKind, Result};
84
+pub use indexed::*;
84 85
 pub use mmap_blob::*;
85 86
 pub use reader::*;
86 87
 
@@ -89,6 +90,7 @@ pub mod block;
89 90
 pub mod dense;
90 91
 pub mod elements;
91 92
 mod error;
93
+pub mod indexed;
92 94
 pub mod mmap_blob;
93 95
 mod proto;
94 96
 pub mod reader;

+ 29
- 0
tests/read.rs Целия файл

@@ -198,3 +198,32 @@ fn par_read_elements() {
198 198
         assert_eq!(elements, 5);
199 199
     }
200 200
 }
201
+
202
+#[test]
203
+fn read_ways_and_deps() {
204
+    for path in &TEST_FILE_PATHS {
205
+        let mut reader = IndexedReader::from_path(path).unwrap();
206
+
207
+        let mut ways = 0;
208
+        let mut nodes = 0;
209
+
210
+        reader.read_ways_and_deps(
211
+            |way| {
212
+                way.tags()
213
+                   .find(|&key_value| key_value == ("building", "yes"))
214
+                   .is_some()
215
+            },
216
+            |element| {
217
+                match element {
218
+                    Element::Way(_) => ways += 1,
219
+                    Element::Node(_) => nodes += 1,
220
+                    Element::DenseNode(_) => nodes += 1,
221
+                    Element::Relation(_) => panic!(), // should not occur
222
+                }
223
+            },
224
+        ).unwrap();
225
+
226
+        assert_eq!(ways, 1);
227
+        assert_eq!(nodes, 3);
228
+    }
229
+}