From 2049c48d57050226044c1971e8ee13e4307ea60a Mon Sep 17 00:00:00 2001 From: Anton Sarukhanov <code@ant.sr> Date: Mon, 12 Dec 2016 08:13:18 -0500 Subject: [PATCH] Clean up HTML instead of stripping it. --- .gitlab-ci.yml | 2 +- requirements.txt | 1 + wooify | 27 ++++++++++++++++++--------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c27eea5..1d95416 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,5 +1,5 @@ before_script: - - apt-get update -qq && apt-get install -y -qq python-virtualenv python3 python3-dev + - apt-get update -qq && apt-get install -y -qq python-virtualenv python3 python3-dev libxslt1-dev libxml2-dev - virtualenv venv -p$(which python3) - . venv/bin/activate - pip install -r requirements.txt diff --git a/requirements.txt b/requirements.txt index 46364b3..8743044 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ awesome-slugify==1.6.5 pep8==1.7.0 +lxml==3.7.0 diff --git a/wooify b/wooify index 07be835..87bd379 100644 --- a/wooify +++ b/wooify @@ -14,7 +14,8 @@ import csv import re import json import datetime -from xml.etree import ElementTree +import lxml.html.clean as lxml_clean +from lxml.etree import XMLSyntaxError from slugify import UniqueSlugify ECWID_CATEGORY_DELIMITER = ' / ' @@ -22,6 +23,10 @@ ECWID_CSV_DELIMITER = ';' WP_SLUG_REGEX = re.compile('[\W-]+') +HTML_ALLOWED_TAGS = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol', + 'li', 'b', 'i', 'strong', 'em', 'u', 'table', 'tr', 'td', + 'th', 'thead', 'tbody'] + WOO_IMP_EXP_PRODUCT_FIELDS = [ # ("FieldName", "Default Value"), ("Id", ""), @@ -102,18 +107,19 @@ WOO_IMP_EXP_CATEGORY_FIELDS = [ ] -def strip_html(html_string): - """Remove all HTML tags from a string.""" - # TODO: Dont actually strip all.. Just remove any inline styles. 
- return ''.join(ElementTree.fromstring("<body>{0}</body>" - .format(html_string)).itertext()) +def clean_html(html_string): + """Remove all attributes from HTML string.""" + cleaner = lxml_clean.Cleaner( + allow_tags=HTML_ALLOWED_TAGS, remove_unknown_tags=False, + page_structure=False, safe_attrs_only=True, safe_attrs=frozenset()) + return cleaner.clean_html("<div>{0}</div>".format(html_string)) def ecwid_parser(r, r_id=None): try: - r["description"] = strip_html(r["description"]) - except ElementTree.ParseError: - print("Malformed HTML could not be stripped: " + r["description"] = clean_html(r["description"]) + except XMLSyntaxError: + print("Error parsing HTML for product: " "{0} ({1})".format(r["name"], (r["sku"]))) raw_category_names = list(filter(None, [r["category{0}".format(c)] if r["category{0}".format(c)] else None @@ -215,6 +221,9 @@ def make_woocommerce_csv(input_file, products_file, categories_file=None): del c['Parent Name'] for product in products: + product['Categories'] = json.dumps([{ + 'name': categories[c]['Name'], 'slug': categories[c]['Slug']} + for c in product['Categories']]) products_writer.writerow(product) if csv_categories: -- GitLab