Skip to content
Snippets Groups Projects
Commit 2049c48d authored by Anton Sarukhanov's avatar Anton Sarukhanov
Browse files

Clean up HTML instead of stripping it.

parent 73e92619
No related branches found
No related tags found
No related merge requests found
Pipeline #33 passed with stage
in 1 minute and 42 seconds
before_script:
- apt-get update -qq && apt-get install -y -qq python-virtualenv python3 python3-dev
- apt-get update -qq && apt-get install -y -qq python-virtualenv python3 python3-dev libxslt1-dev libxml2-dev
- virtualenv venv -p$(which python3)
- . venv/bin/activate
- pip install -r requirements.txt
......
awesome-slugify==1.6.5
pep8==1.7.0
lxml==3.7.0
......@@ -14,7 +14,8 @@ import csv
import re
import json
import datetime
from xml.etree import ElementTree
import lxml.html.clean as lxml_clean
from lxml.etree import XMLSyntaxError
from slugify import UniqueSlugify
ECWID_CATEGORY_DELIMITER = ' / '
......@@ -22,6 +23,10 @@ ECWID_CSV_DELIMITER = ';'
# Matches runs of non-word characters and hyphens. Written as a raw string so
# that ``\W`` reaches the regex engine instead of being an invalid str escape.
WP_SLUG_REGEX = re.compile(r'[\W-]+')
# Tag names handed to the lxml Cleaner as ``allow_tags`` when cleaning HTML.
HTML_ALLOWED_TAGS = [
    'a',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'p', 'ul', 'ol', 'li',
    'b', 'i', 'strong', 'em', 'u',
    'table', 'tr', 'td', 'th', 'thead', 'tbody',
]
WOO_IMP_EXP_PRODUCT_FIELDS = [
# ("FieldName", "Default Value"),
("Id", ""),
......@@ -102,18 +107,19 @@ WOO_IMP_EXP_CATEGORY_FIELDS = [
]
def strip_html(html_string):
    """Return the text content of an HTML fragment with every tag removed.

    The fragment is wrapped in a ``<body>`` element and parsed with
    ``xml.etree.ElementTree``, so markup that is not well-formed XML raises
    ``ElementTree.ParseError``.
    """
    # TODO: Don't actually strip everything; just remove any inline styles.
    wrapped = "<body>{0}</body>".format(html_string)
    root = ElementTree.fromstring(wrapped)
    text_pieces = root.itertext()
    return ''.join(text_pieces)
def clean_html(html_string):
    """Clean an HTML fragment down to the allowed tags, without attributes.

    The fragment is wrapped in a ``<div>`` and run through an lxml
    ``Cleaner`` configured with ``HTML_ALLOWED_TAGS`` as the tag whitelist
    and an empty ``safe_attrs`` set. The cleaner's output is returned.
    """
    wrapped = "<div>{0}</div>".format(html_string)
    cleaner_options = {
        'allow_tags': HTML_ALLOWED_TAGS,
        'remove_unknown_tags': False,
        'page_structure': False,
        # Only "safe" attributes are kept, and the safe set is empty.
        'safe_attrs_only': True,
        'safe_attrs': frozenset(),
    }
    return lxml_clean.Cleaner(**cleaner_options).clean_html(wrapped)
def ecwid_parser(r, r_id=None):
try:
r["description"] = strip_html(r["description"])
except ElementTree.ParseError:
print("Malformed HTML could not be stripped: "
r["description"] = clean_html(r["description"])
except XMLSyntaxError:
print("Error parsing HTML for product: "
"{0} ({1})".format(r["name"], (r["sku"])))
raw_category_names = list(filter(None, [r["category{0}".format(c)]
if r["category{0}".format(c)] else None
......@@ -215,6 +221,9 @@ def make_woocommerce_csv(input_file, products_file, categories_file=None):
del c['Parent Name']
for product in products:
product['Categories'] = json.dumps([{
'name': categories[c]['Name'], 'slug': categories[c]['Slug']}
for c in product['Categories']])
products_writer.writerow(product)
if csv_categories:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment