html2docx
Advanced tools
| <table><tr><td>1</td><td><b>2</b></td></tr><tr><td>3</td></tr></table> |
| [ | ||
| { | ||
| "table": [ | ||
| [{ | ||
| "cell": [{ | ||
| "text": "1", | ||
| "runs": [{ "text": "1" }] | ||
| }] | ||
| }, | ||
| { | ||
| "cell": [{ | ||
| "text": "2", | ||
| "runs": [{ "text": "2", "bold": true}] | ||
| }] | ||
| }], | ||
| [{ | ||
| "cell": [{ | ||
| "text": "3", | ||
| "runs": [{ "text": "3" }] | ||
| }] | ||
| }, | ||
| { | ||
| "cell": [{ | ||
| "text": "", | ||
| "runs": [] | ||
| }] | ||
| }] | ||
| ] | ||
| } | ||
| ] |
| Metadata-Version: 2.1 | ||
| Name: html2docx | ||
| Version: 1.5.0 | ||
| Version: 1.6.0 | ||
| Summary: Convert valid HTML input to docx. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/erezlife/html2docx |
@@ -104,2 +104,4 @@ LICENSE | ||
| tests/data/sup.json | ||
| tests/data/table.html | ||
| tests/data/table.json | ||
| tests/data/underline-strikethrough.html | ||
@@ -106,0 +108,0 @@ tests/data/underline-strikethrough.json |
@@ -8,2 +8,3 @@ import re | ||
| from docx.shared import Pt | ||
| from docx.table import Table | ||
| from docx.text.paragraph import Paragraph | ||
@@ -82,2 +83,3 @@ from docx.text.run import Run | ||
| self.pre = False | ||
| self.table: Optional[Tuple[Table, int, int]] = None | ||
| self.alignment: Optional[int] = None | ||
@@ -106,2 +108,33 @@ self.padding_left: Optional[Pt] = None | ||
def init_table(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Create an empty table in the document and make it the current table.

    ``attrs`` is accepted but not used.  The table starts with zero rows and
    zero columns.  ``self.table`` holds ``(table, row, col)``; both indexes
    start at -1, meaning no row and no cell has been opened yet.
    """
    empty_table = self.doc.add_table(rows=0, cols=0)
    self.table = (empty_table, -1, -1)
def finish_table(self) -> None:
    """Give the current table equal-width columns, then close it.

    Does nothing when no table is open.  The available width is read from
    the first section of the document: page width minus the left and right
    margins.  It is split between the columns with integer division, so
    every column receives the same width.  Afterwards ``self.table`` is
    reset to ``None``.
    """
    if self.table is None:
        return
    section = self.doc.sections[0]
    page_width = section.page_width - section.left_margin - section.right_margin
    columns = self.table[0].columns
    column_count = len(columns)
    # The share is identical for every column, so compute it once.  A table
    # without columns is left untouched, which also rules out a division by
    # zero now that the division no longer lives inside the loop.
    if column_count:
        column_width = page_width // column_count
        for column in columns:
            column.width = column_width
    self.table = None
def init_tr(self) -> None:
    """Append a row to the current table and point the table state at it.

    Does nothing when no table is open.  The row index is advanced by one
    and the column index is reset to -1, i.e. no cell of the new row has
    been opened yet.
    """
    state = self.table
    if state is not None:
        current_table, row_index, _ = state
        current_table.add_row()
        self.table = (current_table, row_index + 1, -1)
def init_tdth(self) -> None:
    """Open the next cell of the current row and direct output into it.

    Does nothing when no table is open.  The column index is advanced by
    one; if the table does not have that many columns yet, a column is
    added.  ``self.p`` becomes the first paragraph of the cell and
    ``self.r`` is cleared.
    """
    if self.table is None:
        return
    table, row, col = self.table
    if row < 0:
        # A cell was opened before any row (<td>/<th> without a preceding
        # <tr>).  Without a row, table.cell(-1, col) below would be called
        # on a table that has no rows, so open an implicit first row.
        table.add_row()
        row = 0
    col += 1
    self.table = (table, row, col)
    if col >= len(table.columns):
        # New columns are added with width 0 here.
        table.add_column(0)
    self.p = table.cell(row, col).paragraphs[0]
    self.r = None
| def init_run(self, attrs: List[Tuple[str, Any]]) -> None: | ||
@@ -201,2 +234,8 @@ self.attrs.append(attrs) | ||
| self.add_list_style("List Bullet") | ||
| elif tag == "table": | ||
| self.init_table(attrs) | ||
| elif tag == "tr": | ||
| self.init_tr() | ||
| elif tag in ["td", "th"]: | ||
| self.init_tdth() | ||
@@ -229,1 +268,6 @@ def handle_data(self, data: str) -> None: | ||
| self.pre = False | ||
| elif tag == "table": | ||
| self.finish_table() | ||
| elif tag in ["td", "th"]: | ||
| self.p = None | ||
| self.r = None |
+1
-1
| Metadata-Version: 2.1 | ||
| Name: html2docx | ||
| Version: 1.5.0 | ||
| Version: 1.6.0 | ||
| Summary: Convert valid HTML input to docx. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/erezlife/html2docx |
+2
-5
| [build-system] | ||
| requires = [ | ||
| "setuptools>=42", | ||
| "wheel", | ||
| ] | ||
| build-backend = "setuptools.build_meta:__legacy__" | ||
| requires = ["setuptools>=42"] | ||
| build-backend = "setuptools.build_meta" |
+1
-1
| [metadata] | ||
| name = html2docx | ||
| version = 1.5.0 | ||
| version = 1.6.0 | ||
| url = https://github.com/erezlife/html2docx | ||
@@ -5,0 +5,0 @@ author = eRezLife |
+84
-32
| import json | ||
| from typing import Union | ||
| import docx | ||
| import pytest | ||
| from docx.document import Document | ||
| from docx.oxml import CT_P, CT_Tbl | ||
| from docx.shared import Pt | ||
| from docx.table import Table, _Cell | ||
| from docx.text.paragraph import Paragraph | ||
@@ -29,2 +34,80 @@ from html2docx import html2docx | ||
def get_document_children(element: Union[Document, _Cell]):
    """Yield the paragraphs and tables found directly inside *element*.

    Children are visited in the order returned by ``iterchildren()`` on the
    document body (for a ``Document``) or on the cell's ``_tc`` element (for
    a ``_Cell``).  ``CT_P`` children are wrapped in ``Paragraph`` and
    ``CT_Tbl`` children in ``Table``; any other child is skipped.

    Raises:
        TypeError: if *element* is neither a ``Document`` nor a ``_Cell``.
            As this is a generator, the error surfaces on first iteration.
    """
    if isinstance(element, Document):
        parent_element = element.element.body
    elif isinstance(element, _Cell):
        parent_element = element._tc
    else:
        # TypeError names the actual problem and is still caught by any
        # caller that handles Exception.
        raise TypeError("Received an item that does not have children.")
    for child in parent_element.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, element)
        elif isinstance(child, CT_Tbl):
            yield Table(child, element)
def assert_paragraph_comply_with_spec(
    p: Paragraph, p_spec: dict, html_rel_path: str, spec_rel_path: str
):
    """Assert that paragraph *p* matches its JSON spec *p_spec*.

    Checked, in order: text, style name (default ``"Normal"``), alignment,
    left indent (given in points), then every run: its text, its font
    attributes and, when the run spec has ``"shapes"``, the inline shapes.
    *html_rel_path* and *spec_rel_path* are used only in failure messages.

    *p_spec* is not modified, so the same spec can be checked again.
    """
    assert p.text == p_spec["text"]
    assert p.style.name == p_spec.get("style", "Normal")

    # A missing or null "alignment" means the paragraph must have none.
    if p_spec.get("alignment") is not None:
        assert p.alignment == p_spec["alignment"]
    else:
        assert p.alignment is None

    # A missing, null or zero "left_indent" means no indent must be set.
    if p_spec.get("left_indent"):
        assert p.paragraph_format.left_indent == Pt(p_spec["left_indent"])
    else:
        assert p.paragraph_format.left_indent is None

    runs_spec = p_spec["runs"]
    assert len(p.runs) == len(runs_spec)
    for run, run_spec in zip(p.runs, runs_spec):
        # Work on a copy: popping from the caller's dict would strip "text"
        # and "shapes" from the spec and break any later check with it.
        font_spec = dict(run_spec)
        assert run.text == font_spec.pop("text")
        shapes_spec = font_spec.pop("shapes", None)

        # Whatever is left must be a known font attribute.  The names are
        # sorted so the failure message is the same on every run.
        unknown = set(font_spec).difference(FONT_ATTRS)
        unknown_names = ", ".join(sorted(unknown))
        assert not unknown, f"Unknown attributes in {spec_rel_path}: {unknown_names}"

        # Attributes absent from the spec are expected to be None.
        for attr in FONT_ATTRS:
            msg = f"Wrong {attr} for text '{run.text}' in {html_rel_path}"
            assert getattr(run.font, attr) == font_spec.get(attr), msg

        if shapes_spec:
            shapes = run.part.inline_shapes
            assert len(shapes) == len(shapes_spec)
            for shape, shape_spec in zip(shapes, shapes_spec):
                assert shape.type == shape_spec["type"]
                assert shape.width == shape_spec["width"]
                assert shape.height == shape_spec["height"]
def assert_table_comply_with_spec(
    t: Table, t_spec: dict, html_rel_path: str, spec_rel_path: str
):
    """Assert that table *t* matches its JSON spec *t_spec*.

    *t_spec* must have a ``"table"`` key holding one list per row, and each
    row list holds one ``{"cell": [...]}`` entry per cell.  The row count
    must match, every row spec must have as many entries as the table has
    columns, and each cell's content is checked against its ``"cell"`` spec.
    """
    assert "table" in t_spec
    rows_spec = t_spec["table"]
    assert len(t.rows) == len(rows_spec)
    for table_row, cells_spec in zip(t.rows, rows_spec):
        assert len(t.columns) == len(cells_spec)
        for table_cell, entry in zip(table_row.cells, cells_spec):
            content_spec = entry["cell"]
            assert_element_comply_with_spec(
                table_cell, content_spec, html_rel_path, spec_rel_path
            )
def assert_element_comply_with_spec(
    element: Union[Document, _Cell], spec: list, html_rel_path: str, spec_rel_path: str
):
    """Assert that the children of *element* match *spec*, one by one.

    *spec* is a list with one entry per direct child (paragraph or table)
    of the document or cell, in order.  The number of children must equal
    the number of entries.  Paragraph children are checked with
    ``assert_paragraph_comply_with_spec`` and table children with
    ``assert_table_comply_with_spec``.
    """
    children = list(get_document_children(element))
    assert len(children) == len(spec)
    for child, child_spec in zip(children, spec):
        # A child is either a paragraph or a table, never both.
        if isinstance(child, Paragraph):
            assert_paragraph_comply_with_spec(
                child, child_spec, html_rel_path, spec_rel_path
            )
        elif isinstance(child, Table):
            assert_table_comply_with_spec(
                child, child_spec, html_rel_path, spec_rel_path
            )
| @pytest.mark.parametrize("html_path,spec_path", generate_testdata()) | ||
@@ -44,33 +127,2 @@ def test_html2docx(html_path, spec_path): | ||
| assert doc.core_properties.title == title | ||
| assert len(doc.paragraphs) == len(spec) | ||
| for p, p_spec in zip(doc.paragraphs, spec): | ||
| assert p.text == p_spec["text"] | ||
| assert p.style.name == p_spec.get("style", "Normal") | ||
| if p_spec.get("alignment") is not None: | ||
| assert p.alignment == p_spec["alignment"] | ||
| else: | ||
| assert p.alignment is None | ||
| if p_spec.get("left_indent"): | ||
| assert p.paragraph_format.left_indent == Pt(p_spec["left_indent"]) | ||
| else: | ||
| assert p.paragraph_format.left_indent is None | ||
| runs_spec = p_spec["runs"] | ||
| assert len(p.runs) == len(runs_spec) | ||
| for run, run_spec in zip(p.runs, runs_spec): | ||
| assert run.text == run_spec.pop("text") | ||
| shapes_spec = run_spec.pop("shapes", None) | ||
| unknown = set(run_spec).difference(FONT_ATTRS) | ||
| assert not unknown, "Unknown attributes in {}: {}".format( | ||
| spec_rel_path, ", ".join(unknown) | ||
| ) | ||
| for attr in FONT_ATTRS: | ||
| msg = f"Wrong {attr} for text '{run.text}' in {html_rel_path}" | ||
| assert getattr(run.font, attr) == run_spec.get(attr), msg | ||
| if shapes_spec: | ||
| shapes = run.part.inline_shapes | ||
| assert len(shapes) == len(shapes_spec) | ||
| for shape, shape_spec in zip(shapes, shapes_spec): | ||
| assert shape.type == shape_spec["type"] | ||
| assert shape.width == shape_spec["width"] | ||
| assert shape.height == shape_spec["height"] | ||
| assert_element_comply_with_spec(doc, spec, html_rel_path, spec_rel_path) |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
93480
4.37%110
1.85%1595
7.7%