python-lxml/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch at master · linux-up/python-lxml · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Fri, 15 Nov 2013 14:49:48 +0100
Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to
 allow graceful handling of broken URLs

---
 CHANGES.txt                               |  4 +++
 src/lxml/html/__init__.py                 | 49 +++++++++++++++++++++++++------
 src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++---
 3 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index ea88d2b..dd52611 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -294,15 +294,21 @@ class HtmlMixin(object):
     ## Link functions
     ########################################

-    def make_links_absolute(self, base_url=None, resolve_base_href=True):
+    def make_links_absolute(self, base_url=None, resolve_base_href=True,
+                            handle_failures=None):
         """
         Make all links in the document absolute, given the
         ``base_url`` for the document (the full URL where the document
-        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
+        came from), or if no ``base_url`` is given, then the ``.base_url``
+        of the document.

         If ``resolve_base_href`` is true, then any ``<base href>``
         tags in the document are used *and* removed from the document.
         If it is false then any such tag is ignored.
+
+        If ``handle_failures`` is None (default), a failure to process
+        a URL will abort the processing.  If set to 'ignore', errors
+        are ignored.  If set to 'discard', failing URLs will be removed.
         """
         if base_url is None:
             base_url = self.base_url
@@ -311,24 +317,48 @@ class HtmlMixin(object):
                     "No base_url given, and the document has no base_url")
         if resolve_base_href:
             self.resolve_base_href()
-        def link_repl(href):
-            return urljoin(base_url, href)
+
+        if handle_failures == 'ignore':
+            def link_repl(href):
+                try:
+                    return urljoin(base_url, href)
+                except ValueError:
+                    return href
+        elif handle_failures == 'discard':
+            def link_repl(href):
+                try:
+                    return urljoin(base_url, href)
+                except ValueError:
+                    return None
+        elif handle_failures is None:
+            def link_repl(href):
+                return urljoin(base_url, href)
+        else:
+            raise ValueError(
+                "unexpected value for handle_failures: %r" % handle_failures)
+
         self.rewrite_links(link_repl)

-    def resolve_base_href(self):
+    def resolve_base_href(self, handle_failures=None):
         """
         Find any ``<base href>`` tag in the document, and apply its
         values to all links found in the document.  Also remove the
         tag once it has been applied.
+
+        If ``handle_failures`` is None (default), a failure to process
+        a URL will abort the processing.  If set to 'ignore', errors
+        are ignored.  If set to 'discard', failing URLs will be removed.
         """
         base_href = None
-        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
+        basetags = self.xpath('//base[@href]|//x:base[@href]',
+                              namespaces={'x': XHTML_NAMESPACE})
         for b in basetags:
             base_href = b.get('href')
             b.drop_tree()
         if not base_href:
             return
-        self.make_links_absolute(base_href, resolve_base_href=False)
+        self.make_links_absolute(base_href, resolve_base_href=False,
+                                 handle_failures=handle_failures)

     def iterlinks(self):
         """
@@ -434,6 +464,7 @@ class HtmlMixin(object):
                 base_href, resolve_base_href=resolve_base_href)
         elif resolve_base_href:
             self.resolve_base_href()
+
         for el, attrib, link, pos in self.iterlinks():
             new_link = link_repl_func(link.strip())
             if new_link == link:
diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
index 43dd99d..dd400b7 100644
--- a/src/lxml/html/tests/test_rewritelinks.txt
+++ b/src/lxml/html/tests/test_rewritelinks.txt
@@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
      </body>
     </html>

+If the document contains invalid links, you may choose to "discard" or "ignore"
+them by passing the respective option into the ``handle_failures`` argument::
+
+    >>> html = lxml.html.fromstring ('''\
+    ... <html><body><div>
+    ...     <a href="http://fancybase.com]Buy">test2</a>
+    ... </div></body></html>''')
+
+    >>> html.make_links_absolute(base_url="http://my.little.server/url/",
+    ...                          handle_failures="discard")
+
+    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
+    <html><body><div>
+        <a>test2</a>
+    </div></body></html>
+
 Check if we can replace multiple links inside of the same text string::

     >>> html = lxml.html.fromstring ("""\
@@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string::

     >>> html.make_links_absolute ()

-    >>> try: _unicode = unicode
-    ... except NameError: _unicode = str
-
-    >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
+    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
     <html>
       <head>
         <title>Test</title>
--
1.8.4.3