pygexml
@dataclass
class
@dataclass
class Page(DataClassJsonMixin):
    """A page: the image file it refers to and its text regions keyed by ID.

    Alternate constructors build a page from a ``Page`` XML element
    (``from_xml*``) or from an ``alto`` XML document (``from_alto*``).
    """

    # File name of the page image as given in the source document.
    image_filename: str
    # Text regions of the page, indexed by their ID.
    regions: dict[ID, TextRegion]

    @staticmethod
    def _parse_utf8(xml_str: str) -> Element:
        """Parse an already-decoded XML string and return its root element.

        The string is re-encoded as UTF-8 for the parser. The parser is told
        explicitly that the bytes are UTF-8 so that an XML declaration naming
        a different encoding (e.g. ``encoding="ISO-8859-1"``) cannot cause the
        bytes to be decoded with the wrong codec.
        """
        parser = etree.XMLParser(encoding="utf-8")
        return etree.fromstring(xml_str.encode("utf-8"), parser=parser)

    @classmethod
    def from_xml(cls, element: Element) -> "Page":
        """Build a page from a ``Page`` XML element.

        Raises:
            PageXMLError: if the element's local name is not ``Page`` or it
                has no ``imageFilename`` attribute.
        """
        if QName(element).localname != "Page":
            raise PageXMLError("Wrong element given")

        if "imageFilename" not in element.attrib:
            raise PageXMLError("No filename found")

        parsed_regions = (
            TextRegion.from_xml(region)
            for region in find_children(element, "TextRegion")
        )

        # Regions sharing an ID collapse to the last one seen (dict semantics).
        return cls(
            image_filename=str(element.attrib["imageFilename"]),
            regions={region.id: region for region in parsed_regions},
        )

    @classmethod
    def from_xml_string(cls, xml_str: str) -> "Page":
        """Parse *xml_str* and build a page from the ``Page`` element found
        under the document root.

        Raises:
            PageXMLError: if no ``Page`` element is found.
        """
        root = cls._parse_utf8(xml_str)
        page_element = find_child(root, "Page")
        if page_element is None:
            raise PageXMLError("No page element found")
        return cls.from_xml(page_element)

    @classmethod
    def from_xml_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
        """Read *file* as text using *encoding* and parse it via
        ``from_xml_string``."""
        xml_string = Path(file).read_text(encoding=encoding)
        return cls.from_xml_string(xml_string)

    @classmethod
    def from_alto(cls, element: Element) -> "Page":
        """Build a page from an ``alto`` root element.

        The image file name is taken from
        ``Description/sourceImageInformation/fileName`` (empty string when the
        element has no text); the regions come from the ``TextBlock`` elements
        under ``Layout/Page/PrintSpace``.

        Raises:
            ALTOXMLError: if the element's local name is not ``alto`` or any
                element on the paths above is missing.
        """
        if QName(element).localname != "alto":
            raise ALTOXMLError("Wrong element given")

        description = find_child(element, "Description")
        if description is None:
            raise ALTOXMLError("No Description element found")
        source_info = find_child(description, "sourceImageInformation")
        if source_info is None:
            raise ALTOXMLError("No sourceImageInformation element found")
        filename_element = find_child(source_info, "fileName")
        if filename_element is None:
            raise ALTOXMLError("No fileName element found")
        image_filename = (
            filename_element.text if filename_element.text is not None else ""
        )

        layout = find_child(element, "Layout")
        if layout is None:
            raise ALTOXMLError("No Layout element found")
        page_element = find_child(layout, "Page")
        if page_element is None:
            raise ALTOXMLError("No Page element found")
        printspace_element = find_child(page_element, "PrintSpace")
        if printspace_element is None:
            raise ALTOXMLError("No PrintSpace element found")

        parsed_regions = (
            TextRegion.from_alto(block)
            for block in find_children(printspace_element, "TextBlock")
        )

        # Regions sharing an ID collapse to the last one seen (dict semantics).
        return cls(
            image_filename=image_filename,
            regions={region.id: region for region in parsed_regions},
        )

    @classmethod
    def from_alto_string(cls, xml_str: str) -> "Page":
        """Parse *xml_str* and build a page from its root element via
        ``from_alto``."""
        return cls.from_alto(cls._parse_utf8(xml_str))

    @classmethod
    def from_alto_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
        """Read *file* as text using *encoding* and parse it via
        ``from_alto_string``."""
        xml_string = Path(file).read_text(encoding=encoding)
        return cls.from_alto_string(xml_string)

    def lookup_region(self, id: ID) -> TextRegion | None:
        """Return the region stored under *id*, or ``None`` if there is none."""
        return self.regions.get(id)

    def all_text(self) -> Iterable[str]:
        """Lazily yield every item of ``all_text()`` of each region, in the
        order the regions are stored."""
        return (line for region in self.regions.values() for line in region.all_text())

    def all_words(self) -> Iterable[str]:
        """Lazily yield every item of ``all_words()`` of each region, in the
        order the regions are stored."""
        return (word for region in self.regions.values() for word in region.all_words())
Page(image_filename: str, regions: dict[str, pygexml.page.TextRegion])
regions: dict[str, pygexml.page.TextRegion]
@classmethod
def from_xml(cls, element: Element) -> "Page":
    """Build a page from a ``Page`` XML element.

    Raises:
        PageXMLError: if the element's local name is not ``Page`` or it has
            no ``imageFilename`` attribute.
    """
    if QName(element).localname != "Page":
        raise PageXMLError("Wrong element given")

    if "imageFilename" not in element.attrib:
        raise PageXMLError("No filename found")

    parsed_regions = (
        TextRegion.from_xml(region)
        for region in find_children(element, "TextRegion")
    )

    # Construct through ``cls`` so subclasses get an instance of their own
    # type. Regions sharing an ID collapse to the last one seen.
    return cls(
        image_filename=str(element.attrib["imageFilename"]),
        regions={region.id: region for region in parsed_regions},
    )
@classmethod
def from_alto(cls, element: Element) -> "Page":
    """Build a page from an ``alto`` root element.

    The image file name is taken from
    ``Description/sourceImageInformation/fileName`` (empty string when the
    element has no text); the regions come from the ``TextBlock`` elements
    under ``Layout/Page/PrintSpace``.

    Raises:
        ALTOXMLError: if the element's local name is not ``alto`` or any
            element on the paths above is missing.
    """
    if QName(element).localname != "alto":
        raise ALTOXMLError("Wrong element given")

    description = find_child(element, "Description")
    if description is None:
        raise ALTOXMLError("No Description element found")
    source_info = find_child(description, "sourceImageInformation")
    if source_info is None:
        raise ALTOXMLError("No sourceImageInformation element found")
    filename_element = find_child(source_info, "fileName")
    if filename_element is None:
        raise ALTOXMLError("No fileName element found")
    image_filename = (
        filename_element.text if filename_element.text is not None else ""
    )

    layout = find_child(element, "Layout")
    if layout is None:
        raise ALTOXMLError("No Layout element found")
    page_element = find_child(layout, "Page")
    if page_element is None:
        raise ALTOXMLError("No Page element found")
    printspace_element = find_child(page_element, "PrintSpace")
    if printspace_element is None:
        raise ALTOXMLError("No PrintSpace element found")

    parsed_regions = (
        TextRegion.from_alto(block)
        for block in find_children(printspace_element, "TextBlock")
    )

    # Construct through ``cls`` so subclasses get an instance of their own
    # type. Regions sharing an ID collapse to the last one seen.
    return cls(
        image_filename=image_filename,
        regions={region.id: region for region in parsed_regions},
    )