From 2049c48d57050226044c1971e8ee13e4307ea60a Mon Sep 17 00:00:00 2001
From: Anton Sarukhanov <code@ant.sr>
Date: Mon, 12 Dec 2016 08:13:18 -0500
Subject: [PATCH] Clean up HTML instead of stripping it.

---
 .gitlab-ci.yml   |  2 +-
 requirements.txt |  1 +
 wooify           | 27 ++++++++++++++++++---------
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c27eea5..1d95416 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,5 +1,5 @@
 before_script:
-  - apt-get update -qq && apt-get install -y -qq python-virtualenv python3 python3-dev
+  - apt-get update -qq && apt-get install -y -qq python-virtualenv python3 python3-dev libxslt1-dev libxml2-dev
   - virtualenv venv -p$(which python3)
   - . venv/bin/activate
   - pip install -r requirements.txt
diff --git a/requirements.txt b/requirements.txt
index 46364b3..8743044 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 awesome-slugify==1.6.5
 pep8==1.7.0
+lxml==3.7.0
diff --git a/wooify b/wooify
index 07be835..87bd379 100644
--- a/wooify
+++ b/wooify
@@ -14,7 +14,8 @@ import csv
 import re
 import json
 import datetime
-from xml.etree import ElementTree
+import lxml.html.clean as lxml_clean
+from lxml.etree import XMLSyntaxError
 from slugify import UniqueSlugify
 
 ECWID_CATEGORY_DELIMITER = ' / '
@@ -22,6 +23,10 @@ ECWID_CSV_DELIMITER = ';'
 
 WP_SLUG_REGEX = re.compile('[\W-]+')
 
+HTML_ALLOWED_TAGS = ['a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'ul', 'ol',
+                     'li', 'b', 'i', 'strong', 'em', 'u', 'table', 'tr', 'td',
+                     'th', 'thead', 'tbody']
+
 WOO_IMP_EXP_PRODUCT_FIELDS = [
     # ("FieldName", "Default Value"),
     ("Id", ""),
@@ -102,18 +107,19 @@ WOO_IMP_EXP_CATEGORY_FIELDS = [
 ]
 
 
-def strip_html(html_string):
-    """Remove all HTML tags from a string."""
-    # TODO: Dont actually strip all.. Just remove any inline styles.
-    return ''.join(ElementTree.fromstring("<body>{0}</body>"
-                                          .format(html_string)).itertext())
+def clean_html(html_string):
+    """Drop every attribute and any tag not in HTML_ALLOWED_TAGS."""
+    cleaner = lxml_clean.Cleaner(
+        allow_tags=HTML_ALLOWED_TAGS, remove_unknown_tags=False,
+        page_structure=False, safe_attrs_only=True, safe_attrs=frozenset())
+    return cleaner.clean_html("<div>{0}</div>".format(html_string))
 
 
 def ecwid_parser(r, r_id=None):
     try:
-        r["description"] = strip_html(r["description"])
-    except ElementTree.ParseError:
-        print("Malformed HTML could not be stripped: "
+        r["description"] = clean_html(r["description"])
+    except XMLSyntaxError:
+        print("Error parsing HTML for product: "
               "{0} ({1})".format(r["name"], (r["sku"])))
     raw_category_names = list(filter(None, [r["category{0}".format(c)]
                                      if r["category{0}".format(c)] else None
@@ -215,6 +221,9 @@ def make_woocommerce_csv(input_file, products_file, categories_file=None):
             del c['Parent Name']
 
         for product in products:
+            product['Categories'] = json.dumps([{
+                'name': categories[c]['Name'], 'slug': categories[c]['Slug']}
+                for c in product['Categories']])
             products_writer.writerow(product)
 
         if csv_categories:
-- 
GitLab