pygexml

1from .page import Page
2
3__all__ = ["Page"]
@dataclass
class Page(dataclasses_json.api.DataClassJsonMixin):
@dataclass
class Page(DataClassJsonMixin):
    """A page: the name of its image file plus the text regions found on it.

    Instances can be built from PAGE XML (``from_xml``, ``from_xml_string``,
    ``from_xml_file``) or from ALTO XML (``from_alto``, ``from_alto_string``,
    ``from_alto_file``). All alternate constructors build the result through
    ``cls`` so that subclasses receive instances of their own type.
    """

    # Name of the page image as recorded in the source document.
    image_filename: str
    # Text regions of the page, keyed by region id.
    regions: dict[ID, TextRegion]

    @classmethod
    def from_xml(cls, element: Element) -> "Page":
        """Build a page from a PAGE XML ``Page`` element.

        Args:
            element: Element whose local name is ``Page``. It must carry an
                ``imageFilename`` attribute.

        Returns:
            A page with one ``TextRegion`` per ``TextRegion`` element returned
            by ``find_children``, keyed by region id. If several regions share
            an id, the last one replaces the earlier ones.

        Raises:
            PageXMLError: If ``element`` is not a ``Page`` element or has no
                ``imageFilename`` attribute.
        """
        if QName(element).localname != "Page":
            raise PageXMLError("Wrong element given")

        if "imageFilename" not in element.attrib:
            raise PageXMLError("No filename found")

        parsed_regions = (
            TextRegion.from_xml(region_element)
            for region_element in find_children(element, "TextRegion")
        )

        return cls(
            image_filename=str(element.attrib["imageFilename"]),
            regions={region.id: region for region in parsed_regions},
        )

    @classmethod
    def from_xml_string(cls, xml_str: str) -> "Page":
        """Parse a PAGE XML document given as a string.

        The string is encoded as UTF-8 and parsed with ``etree.fromstring``;
        the ``Page`` element is then looked up below the parsed root with
        ``find_child`` and handed to ``from_xml``.

        Raises:
            PageXMLError: If no ``Page`` element is found.
        """
        document_root = etree.fromstring(xml_str.encode("utf-8"))
        page_element = find_child(document_root, "Page")
        if page_element is None:
            raise PageXMLError("No page element found")
        return cls.from_xml(page_element)

    @classmethod
    def from_xml_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
        """Read a PAGE XML file and build a page from it.

        Args:
            file: Path of the XML file.
            encoding: Text encoding used to decode the file.

        The decoded text is passed to ``from_xml_string``, which re-encodes
        it as UTF-8 before parsing.
        NOTE(review): a file whose XML declaration names a non-UTF-8 encoding
        may not survive that round trip -- confirm against the parser.
        """
        xml_string = Path(file).read_text(encoding=encoding)
        return cls.from_xml_string(xml_string)

    @classmethod
    def from_alto(cls, element: Element) -> "Page":
        """Build a page from the root ``alto`` element of an ALTO document.

        The image file name is read from
        ``Description/sourceImageInformation/fileName`` (an element without
        text yields ``""``). The regions are built from the ``TextBlock``
        elements that ``find_children`` returns for
        ``Layout/Page/PrintSpace``.

        Args:
            element: Element whose local name is ``alto``.

        Returns:
            A page whose regions are keyed by region id. If several regions
            share an id, the last one replaces the earlier ones.

        Raises:
            ALTOXMLError: If ``element`` is not an ``alto`` element or one of
                the elements on the two paths above is missing.
        """

        def require_child(parent: Element, name: str) -> Element:
            # Look up a child element or fail with a message naming it.
            child = find_child(parent, name)
            if child is None:
                raise ALTOXMLError(f"No {name} element found")
            return child

        if QName(element).localname != "alto":
            raise ALTOXMLError("Wrong element given")

        description = require_child(element, "Description")
        source_image_info = require_child(description, "sourceImageInformation")
        filename_element = require_child(source_image_info, "fileName")
        # ``text`` is None for an element without text; use "" in that case.
        image_filename = filename_element.text or ""

        layout = require_child(element, "Layout")
        page_element = require_child(layout, "Page")
        print_space = require_child(page_element, "PrintSpace")

        parsed_regions = (
            TextRegion.from_alto(text_block)
            for text_block in find_children(print_space, "TextBlock")
        )

        return cls(
            image_filename=image_filename,
            regions={region.id: region for region in parsed_regions},
        )

    @classmethod
    def from_alto_string(cls, xml_str: str) -> "Page":
        """Parse an ALTO document given as a string.

        The string is encoded as UTF-8, parsed with ``etree.fromstring`` and
        the resulting root element is handed to ``from_alto``.
        """
        document_root = etree.fromstring(xml_str.encode("utf-8"))
        return cls.from_alto(document_root)

    @classmethod
    def from_alto_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
        """Read an ALTO file and build a page from it.

        Args:
            file: Path of the XML file.
            encoding: Text encoding used to decode the file.

        The decoded text is passed to ``from_alto_string``, which re-encodes
        it as UTF-8 before parsing.
        NOTE(review): a file whose XML declaration names a non-UTF-8 encoding
        may not survive that round trip -- confirm against the parser.
        """
        xml_string = Path(file).read_text(encoding=encoding)
        return cls.from_alto_string(xml_string)

    def lookup_region(self, id: ID) -> TextRegion | None:
        """Return the region stored under ``id``, or ``None`` if there is none."""
        return self.regions.get(id)

    def all_text(self) -> Iterable[str]:
        """Lazily yield every item of ``all_text()`` of each region, region by region."""
        return (
            line
            for region in self.regions.values()
            for line in region.all_text()
        )

    def all_words(self) -> Iterable[str]:
        """Lazily yield every item of ``all_words()`` of each region, region by region."""
        return (
            word
            for region in self.regions.values()
            for word in region.all_words()
        )
Page(image_filename: str, regions: dict[str, pygexml.page.TextRegion])
image_filename: str
regions: dict[str, pygexml.page.TextRegion]
@classmethod
def from_xml(cls, element: lxml.etree._Element) -> Page:
232    @classmethod
233    def from_xml(cls, element: Element) -> "Page":
234        if QName(element).localname != "Page":
235            raise PageXMLError("Wrong element given")
236
237        if "imageFilename" not in element.attrib:
238            raise PageXMLError("No filename found")
239
240        regions = find_children(element, "TextRegion")
241
242        return Page(
243            image_filename=str(element.attrib["imageFilename"]),
244            regions={
245                tr.id: tr for tr in (TextRegion.from_xml(region) for region in regions)
246            },
247        )
@classmethod
def from_xml_string(cls, xml_str: str) -> Page:
249    @classmethod
250    def from_xml_string(cls, xml_str: str) -> "Page":
251        root = etree.fromstring(xml_str.encode("utf-8"))
252        page_element = find_child(root, "Page")
253        if page_element is None:
254            raise PageXMLError("No page element found")
255        return cls.from_xml(page_element)
@classmethod
def from_xml_file( cls, file: pathlib.Path | str, encoding: str = 'utf-8') -> Page:
257    @classmethod
258    def from_xml_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
259        path = Path(file)
260        xml_string = path.read_text(encoding=encoding)
261        return Page.from_xml_string(xml_string)
@classmethod
def from_alto(cls, element: lxml.etree._Element) -> Page:
263    @classmethod
264    def from_alto(cls, element: Element) -> "Page":
265        if QName(element).localname != "alto":
266            raise ALTOXMLError("Wrong element given")
267
268        image_element = find_child(element, "Description")
269        if image_element is None:
270            raise ALTOXMLError("No Description element found")
271        image_element = find_child(image_element, "sourceImageInformation")
272        if image_element is None:
273            raise ALTOXMLError("No sourceImageInformation element found")
274        filename_element = find_child(image_element, "fileName")
275        if filename_element is None:
276            raise ALTOXMLError("No fileName element found")
277        image_filename = (
278            filename_element.text if filename_element.text is not None else ""
279        )
280
281        layout = find_child(element, "Layout")
282        if layout is None:
283            raise ALTOXMLError("No Layout element found")
284        page_element = find_child(layout, "Page")
285        if page_element is None:
286            raise ALTOXMLError("No Page element found")
287        printspace_element = find_child(page_element, "PrintSpace")
288        if printspace_element is None:
289            raise ALTOXMLError("No PrintSpace element found")
290
291        text_blocks = find_children(printspace_element, "TextBlock")
292
293        return Page(
294            image_filename=image_filename,
295            regions={
296                tb.id: tb for tb in (TextRegion.from_alto(tb) for tb in text_blocks)
297            },
298        )
@classmethod
def from_alto_string(cls, xml_str: str) -> Page:
300    @classmethod
301    def from_alto_string(cls, xml_str: str) -> "Page":
302        root = etree.fromstring(xml_str.encode("utf-8"))
303        return cls.from_alto(root)
@classmethod
def from_alto_file( cls, file: pathlib.Path | str, encoding: str = 'utf-8') -> Page:
305    @classmethod
306    def from_alto_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page":
307        path = Path(file)
308        xml_string = path.read_text(encoding=encoding)
309        return Page.from_alto_string(xml_string)
def lookup_region(self, id: str) -> pygexml.page.TextRegion | None:
311    def lookup_region(self, id: ID) -> TextRegion | None:
312        return self.regions.get(id)
def all_text(self) -> Iterable[str]:
314    def all_text(self) -> Iterable[str]:
315        return (line for region in self.regions.values() for line in region.all_text())
def all_words(self) -> Iterable[str]:
317    def all_words(self) -> Iterable[str]:
318        return (word for region in self.regions.values() for word in region.all_words())