html2docx - PyPI Package Compare versions

+1

tests/data/table.html

+30

tests/data/table.json

		[
		{
		"table": [
		[{
		"cell": [{
		"text": "1",
		"runs": [{ "text": "1" }]
		}]
		},
		{
		"cell": [{
		"text": "2",
		"runs": [{ "text": "2", "bold": true}]
		}]
		}],
		[{
		"cell": [{
		"text": "3",
		"runs": [{ "text": "3" }]
		}]
		},
		{
		"cell": [{
		"text": "",
		"runs": []
		}]
		}]
		]
		}
		]

+1

-1

html2docx.egg-info/PKG-INFO

		Metadata-Version: 2.1
		Name: html2docx
		Version: 1.5.0
		Version: 1.6.0
		Summary: Convert valid HTML input to docx.
		@@ -5,0 +5,0 @@ Home-page: https://github.com/erezlife/html2docx

+2

-0

html2docx.egg-info/SOURCES.txt

		@@ -104,2 +104,4 @@ LICENSE
		tests/data/sup.json
		tests/data/table.html
		tests/data/table.json
		tests/data/underline-strikethrough.html
		@@ -106,0 +108,0 @@ tests/data/underline-strikethrough.json

+44

-0

html2docx/html2docx.py

		@@ -8,2 +8,3 @@ import re
		from docx.shared import Pt
		from docx.table import Table
		from docx.text.paragraph import Paragraph
		@@ -82,2 +83,3 @@ from docx.text.run import Run
		self.pre = False
		self.table: Optional[Tuple[Table, int, int]] = None
		self.alignment: Optional[int] = None
		@@ -106,2 +108,33 @@ self.padding_left: Optional[Pt] = None

		def init_table(self, attrs: List[Tuple[str, Optional[str]]]) -> None:
		    """Start tracking a new, empty table.

		    Asks ``self.doc`` for a table with zero rows and zero columns and
		    stores it in ``self.table`` as ``(table, row, col)`` with both
		    indexes set to -1.  ``attrs`` is accepted but not used.
		    """
		    new_table = self.doc.add_table(rows=0, cols=0)
		    self.table = (new_table, -1, -1)

		def finish_table(self) -> None:
		    """Close the open table, giving every column an equal share of the width.

		    The width to share is the first section's page width minus its left
		    and right margins.  Does nothing when no table is open.  On return
		    ``self.table`` is ``None``.
		    """
		    if self.table is None:
		        return
		    section = self.doc.sections[0]
		    page_width = section.page_width - section.left_margin - section.right_margin
		    columns = self.table[0].columns
		    # The column count does not change while widths are assigned, so the
		    # shared width is computed once.  The guard keeps a table without
		    # columns from dividing by zero.
		    column_count = len(columns)
		    if column_count:
		        column_width = page_width // column_count
		        for column in columns:
		            column.width = column_width
		    self.table = None

		def init_tr(self) -> None:
		    """Append a row to the open table and make it the current row.

		    Does nothing when no table is open.  The stored column index is
		    reset to -1.
		    """
		    if self.table is not None:
		        table, row_index, _ = self.table
		        table.add_row()
		        self.table = (table, row_index + 1, -1)

		def init_tdth(self) -> None:
		    """Advance to the next cell of the current row and write into it.

		    Does nothing when no table is open.  Otherwise the column index is
		    incremented, a column is added when the table is not yet wide
		    enough, ``self.p`` is pointed at the first paragraph of the cell
		    and ``self.r`` is cleared.

		    If a cell arrives before any row was started, a row is added
		    first so that the cell lookup has a row to address.
		    """
		    if self.table is None:
		        return
		    table, row, col = self.table
		    if row < 0:
		        # Cell without a preceding <tr>: open the row implicitly instead
		        # of looking up a cell in row -1 of a table that has no rows.
		        table.add_row()
		        row = 0
		    col += 1
		    self.table = (table, row, col)
		    if col >= len(table.columns):
		        # Width 0 is a placeholder; finish_table() assigns the real widths.
		        table.add_column(0)
		    self.p = table.cell(row, col).paragraphs[0]
		    self.r = None

		def init_run(self, attrs: List[Tuple[str, Any]]) -> None:
		@@ -201,2 +234,8 @@ self.attrs.append(attrs)
		self.add_list_style("List Bullet")
		elif tag == "table":
		self.init_table(attrs)
		elif tag == "tr":
		self.init_tr()
		elif tag in ["td", "th"]:
		self.init_tdth()

		@@ -229,1 +268,6 @@ def handle_data(self, data: str) -> None:
		self.pre = False
		elif tag == "table":
		self.finish_table()
		elif tag in ["td", "th"]:
		self.p = None
		self.r = None

+1

-1

PKG-INFO

		Metadata-Version: 2.1
		Name: html2docx
		Version: 1.5.0
		Version: 1.6.0
		Summary: Convert valid HTML input to docx.
		@@ -5,0 +5,0 @@ Home-page: https://github.com/erezlife/html2docx

+2

-5

pyproject.toml

		[build-system]
		requires = [
		"setuptools>=42",
		"wheel",
		]
		build-backend = "setuptools.build_meta:__legacy__"
		requires = ["setuptools>=42"]
		build-backend = "setuptools.build_meta"

+1

-1

setup.cfg

		[metadata]
		name = html2docx
		version = 1.5.0
		version = 1.6.0
		url = https://github.com/erezlife/html2docx
		@@ -5,0 +5,0 @@ author = eRezLife

+84

-32

tests/test_html2docx.py

		import json
		from typing import Union

		import docx
		import pytest
		from docx.document import Document
		from docx.oxml import CT_P, CT_Tbl
		from docx.shared import Pt
		from docx.table import Table, _Cell
		from docx.text.paragraph import Paragraph

		@@ -29,2 +34,80 @@ from html2docx import html2docx

		def get_document_children(element: Union[Document, _Cell]):
		    """Yield the paragraphs and tables found directly under *element*.

		    For a Document the children of ``element.element.body`` are walked,
		    for a _Cell the children of ``element._tc``.  CT_P children are
		    yielded as Paragraph and CT_Tbl children as Table, each created with
		    *element* as its parent; any other child is skipped.

		    Raises Exception when *element* is neither a Document nor a _Cell.
		    """
		    if isinstance(element, Document):
		        container = element.element.body
		    elif isinstance(element, _Cell):
		        container = element._tc
		    else:
		        raise Exception("Received an item that does not have children.")
		    # Maps each XML element class to the wrapper class that is yielded for it.
		    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
		    for node in container.iterchildren():
		        for xml_type, wrapper_type in wrappers:
		            if isinstance(node, xml_type):
		                yield wrapper_type(node, element)
		                break


		def assert_paragraph_comply_with_spec(
		    p: Paragraph, p_spec: dict, html_rel_path: str, spec_rel_path: str
		):
		    """Assert that paragraph *p* matches its expected description *p_spec*.

		    Checked in order: the text, the style name (``"Normal"`` when the
		    spec has no ``"style"``), the alignment, the left indent (compared
		    against ``Pt(p_spec["left_indent"])``), and then each run: its text,
		    every font attribute named in FONT_ATTRS and, when the run spec has
		    a truthy ``"shapes"`` entry, the inline shapes.

		    *p_spec* and its run specs are only read, never modified.
		    *html_rel_path* and *spec_rel_path* are used in failure messages only.
		    """
		    assert p.text == p_spec["text"]
		    assert p.style.name == p_spec.get("style", "Normal")
		    if p_spec.get("alignment") is not None:
		        assert p.alignment == p_spec["alignment"]
		    else:
		        assert p.alignment is None
		    if p_spec.get("left_indent"):
		        assert p.paragraph_format.left_indent == Pt(p_spec["left_indent"])
		    else:
		        assert p.paragraph_format.left_indent is None

		    runs_spec = p_spec["runs"]
		    assert len(p.runs) == len(runs_spec)
		    for run, run_spec in zip(p.runs, runs_spec):
		        assert run.text == run_spec["text"]
		        shapes_spec = run_spec.get("shapes")
		        # Work on a filtered copy so the caller's spec is left untouched:
		        # everything besides "text" and "shapes" must name a font attribute.
		        font_spec = {
		            key: value
		            for key, value in run_spec.items()
		            if key not in ("text", "shapes")
		        }
		        # Sorted so the failure message is stable between runs.
		        unknown = sorted(set(font_spec).difference(FONT_ATTRS))
		        assert not unknown, "Unknown attributes in {}: {}".format(
		            spec_rel_path, ", ".join(unknown)
		        )
		        for attr in FONT_ATTRS:
		            msg = f"Wrong {attr} for text '{run.text}' in {html_rel_path}"
		            assert getattr(run.font, attr) == font_spec.get(attr), msg
		        if shapes_spec:
		            # NOTE(review): the shapes are read from run.part, not from the
		            # run itself -- confirm that this is the intended scope.
		            shapes = run.part.inline_shapes
		            assert len(shapes) == len(shapes_spec)
		            for shape, shape_spec in zip(shapes, shapes_spec):
		                assert shape.type == shape_spec["type"]
		                assert shape.width == shape_spec["width"]
		                assert shape.height == shape_spec["height"]


		def assert_table_comply_with_spec(
		    t: Table, t_spec: dict, html_rel_path: str, spec_rel_path: str
		):
		    """Assert that table *t* matches its expected description *t_spec*.

		    *t_spec* must contain a ``"table"`` entry holding one item per row;
		    each row item holds one item per cell, and every cell item has a
		    ``"cell"`` entry that is checked against the cell's content.  The
		    number of rows must match, and every row spec must have as many
		    entries as the table has columns.
		    """
		    assert "table" in t_spec
		    rows_spec = t_spec["table"]
		    assert len(t.rows) == len(rows_spec)
		    for table_row, cells_spec in zip(t.rows, rows_spec):
		        assert len(t.columns) == len(cells_spec)
		        for table_cell, cell_spec in zip(table_row.cells, cells_spec):
		            assert_element_comply_with_spec(
		                table_cell, cell_spec["cell"], html_rel_path, spec_rel_path
		            )


		def assert_element_comply_with_spec(
		    element: Union[Document, _Cell], spec: dict, html_rel_path: str, spec_rel_path: str
		):
		    """Assert that the children of *element* match *spec* item by item.

		    The children come from get_document_children().  Their count must
		    equal ``len(spec)``; each child is then paired with the spec item at
		    the same position and handed to the paragraph or table checker
		    according to its type.
		    """
		    children = list(get_document_children(element))
		    assert len(children) == len(spec)
		    for child, child_spec in zip(children, spec):
		        if isinstance(child, Paragraph):
		            check = assert_paragraph_comply_with_spec
		        elif isinstance(child, Table):
		            check = assert_table_comply_with_spec
		        else:
		            continue
		        check(child, child_spec, html_rel_path, spec_rel_path)


		@pytest.mark.parametrize("html_path,spec_path", generate_testdata())
		@@ -44,33 +127,2 @@ def test_html2docx(html_path, spec_path):
		assert doc.core_properties.title == title
		assert len(doc.paragraphs) == len(spec)
		for p, p_spec in zip(doc.paragraphs, spec):
		assert p.text == p_spec["text"]
		assert p.style.name == p_spec.get("style", "Normal")
		if p_spec.get("alignment") is not None:
		assert p.alignment == p_spec["alignment"]
		else:
		assert p.alignment is None
		if p_spec.get("left_indent"):
		assert p.paragraph_format.left_indent == Pt(p_spec["left_indent"])
		else:
		assert p.paragraph_format.left_indent is None

		runs_spec = p_spec["runs"]
		assert len(p.runs) == len(runs_spec)
		for run, run_spec in zip(p.runs, runs_spec):
		assert run.text == run_spec.pop("text")
		shapes_spec = run_spec.pop("shapes", None)
		unknown = set(run_spec).difference(FONT_ATTRS)
		assert not unknown, "Unknown attributes in {}: {}".format(
		spec_rel_path, ", ".join(unknown)
		)
		for attr in FONT_ATTRS:
		msg = f"Wrong {attr} for text '{run.text}' in {html_rel_path}"
		assert getattr(run.font, attr) == run_spec.get(attr), msg
		if shapes_spec:
		shapes = run.part.inline_shapes
		assert len(shapes) == len(shapes_spec)
		for shape, shape_spec in zip(shapes, shapes_spec):
		assert shape.type == shape_spec["type"]
		assert shape.width == shape_spec["width"]
		assert shape.height == shape_spec["height"]
		assert_element_comply_with_spec(doc, spec, html_rel_path, spec_rel_path)

html2docx - PyPI Package Compare versions

Improved metrics