source: titan/mediathek/localhoster/lib/helpers.py @ 41267

1"""
2    URLResolver Addon for Kodi
3    Copyright (C) 2016 t0mm0, tknorris
4
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation, either version 3 of the License, or
8    (at your option) any later version.
9
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.
17"""
18import re
19import urllib
20#import xbmcgui
21import jsunpack
22from urlparse import urlparse
23#from urlresolver import common
24#from urlresolver.resolver import ResolverError
25import common
26from net import Net
27
def get_hidden(html, form_id=None, index=None, include_submit=True):
    hidden = {}
    if form_id:
        pattern = '''<form [^>]*(?:id|name)\s*=\s*['"]?%s['"]?[^>]*>(.*?)</form>''' % (form_id)
    else:
        pattern = '''<form[^>]*>(.*?)</form>'''

    html = cleanse_html(html)

    for i, form in enumerate(re.finditer(pattern, html, re.DOTALL | re.I)):
#        common.logger.log(form.group(1))
        if index is None or i == index:
            for field in re.finditer('''<input [^>]*type=['"]?hidden['"]?[^>]*>''', form.group(1)):
                match = re.search('''name\s*=\s*['"]([^'"]+)''', field.group(0))
                match1 = re.search('''value\s*=\s*['"]([^'"]*)''', field.group(0))
                if match and match1:
                    hidden[match.group(1)] = match1.group(1)

            if include_submit:
                match = re.search('''<input [^>]*type=['"]?submit['"]?[^>]*>''', form.group(1))
                if match:
                    name = re.search('''name\s*=\s*['"]([^'"]+)''', match.group(0))
                    value = re.search('''value\s*=\s*['"]([^'"]*)''', match.group(0))
                    if name and value:
                        hidden[name.group(1)] = value.group(1)

#    common.logger.log_debug('Hidden fields are: %s' % (hidden))
    return hidden

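# Illustrative example (not part of the original module): given
#   <form id="login"><input type="hidden" name="token" value="abc123"></form>
# get_hidden(html, form_id='login') returns {'token': 'abc123'}, ready to be
# re-posted together with the visible form fields.
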
def pick_source(sources, auto_pick=None):
#    if auto_pick is None:
#        auto_pick = common.get_setting('auto_pick') == 'true'

    # the interactive Kodi selection dialog from upstream urlresolver is
    # disabled here, so the first (highest ranked) source is always taken,
    # regardless of auto_pick; an empty source list yields None
    if sources:
        return sources[0][1]

#            result = xbmcgui.Dialog().select(common.i18n('choose_the_link'), [str(source[0]) if source[0] else 'Unknown' for source in sources])
#            if result == -1:
#                raise ResolverError(common.i18n('no_link_selected'))
#            else:
#                return sources[result][1]
#    else:
#        raise ResolverError(common.i18n('no_video_link'))

def append_headers(headers):
    # Kodi convention: HTTP headers are appended to the media URL after a
    # pipe, as URL-encoded key=value pairs joined with '&'
    return '|%s' % '&'.join(['%s=%s' % (key, urllib.quote_plus(headers[key])) for key in headers])

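# Illustrative example (not part of the original module):
#   append_headers({'User-Agent': 'Mozilla/5.0', 'Referer': 'http://host/'})
# returns something like
#   '|User-Agent=Mozilla%2F5.0&Referer=http%3A%2F%2Fhost%2F'
# (dict ordering is arbitrary on this Python 2 code base).
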
def get_packed_data(html):
    packed_data = ''
    for match in re.finditer('(eval\s*\(function.*?)</script>', html, re.DOTALL | re.I):
        try:
            js_data = jsunpack.unpack(match.group(1))
            js_data = js_data.replace('\\', '')
            packed_data += js_data
        except:
            pass

    return packed_data

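# Illustrative note: this targets Dean Edwards style packers, i.e. scripts of
# the form
#   <script>eval(function(p,a,c,k,e,d){...}('...',62,123,'...'.split('|')))</script>
# jsunpack.unpack() returns the decoded JavaScript, which often contains the
# real 'sources'/'file' definitions scraped further below.
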
def parse_sources_list(html):
    sources = []
    match = re.search('''['"]?sources['"]?\s*:\s*\[(.*?)\]''', html, re.DOTALL)
    if match:
        sources = [(m[1], m[0].replace('\/', '/')) for m in re.findall('''['"]?file['"]?\s*:\s*['"]([^'"]+)['"][^}]*['"]?label['"]?\s*:\s*['"]([^'"]*)''', match.group(1), re.DOTALL)]
    return sources

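# Illustrative example (not part of the original module): a jwplayer style
# setup such as
#   sources: [{file:"http:\/\/host\/video.mp4",label:"720p"}]
# yields [('720p', 'http://host/video.mp4')].
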
def parse_html5_source_list(html):
    # prefer the data-res attribute as the quality label when present,
    # otherwise fall back to the MIME subtype
    label_attrib = 'type' if not re.search('''<source\s+src\s*=.*?data-res\s*=.*?/\s*>''', html) else 'data-res'
    sources = [(m[1], m[0].replace('\/', '/')) for m in re.findall('''<source\s+src\s*=\s*['"]([^'"]+)['"](?:.*?''' + label_attrib + '''\s*=\s*['"](?:video/)?([^'"]+)['"])''', html, re.DOTALL)]
    return sources

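# Illustrative example (not part of the original module):
#   <source src="http://host/video.mp4" type="video/mp4"/>
# yields [('mp4', 'http://host/video.mp4')]; with a data-res="720" attribute
# present the label would be '720' instead.
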
def parse_smil_source_list(smil):
    sources = []
    match = re.search('base\s*=\s*"([^"]+)', smil)
    if not match:
        return sources  # no rtmp base URL, nothing to build sources from
    base = match.group(1)
    for i in re.finditer('src\s*=\s*"([^"]+)(?:"\s*(?:width|height)\s*=\s*"([^"]+))?', smil):
        label = 'Unknown'
        if (len(i.groups()) > 1) and (i.group(2) is not None):
            label = i.group(2)
        sources += [(label, '%s playpath=%s' % (base, i.group(1)))]
    return sources

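# Illustrative example (not part of the original module): a SMIL body like
#   <meta base="rtmp://host/app"/> <video src="mp4:clip" height="720"/>
# yields [('720', 'rtmp://host/app playpath=mp4:clip')], the 'url playpath=x'
# form expected by Kodi's RTMP input.
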
def scrape_sources(html, result_blacklist=None, scheme='http', patterns=None, generic_patterns=True):
    if patterns is None: patterns = []

    def __parse_to_list(_html, regex):
        _blacklist = ['.jpg', '.jpeg', '.gif', '.png', '.js', '.css', '.htm', '.html', '.php', '.srt', '.sub', '.xml', '.swf', '.vtt', '.mpd']
        _blacklist = set(_blacklist + result_blacklist)
        streams = []
        labels = []
        for r in re.finditer(regex, _html, re.DOTALL):
            match = r.groupdict()
            stream_url = match['url'].replace('&amp;', '&')
            file_name = urlparse(stream_url[:-1]).path.split('/')[-1] if stream_url.endswith("/") else urlparse(stream_url).path.split('/')[-1]
            blocked = not file_name or any(item in file_name.lower() for item in _blacklist)
            if stream_url.startswith('//'): stream_url = scheme + ':' + stream_url
            if '://' not in stream_url or blocked or (stream_url in streams) or any(stream_url == t[1] for t in source_list):
                continue

            label = match.get('label', file_name)
            if label is None: label = file_name
            labels.append(label)
            streams.append(stream_url)

        matches = zip(labels, streams)
#        if matches:
#            common.logger.log_debug('Scrape sources |%s| found |%s|' % (regex, matches))
        return matches

    if result_blacklist is None:
        result_blacklist = []
    elif isinstance(result_blacklist, str):
        result_blacklist = [result_blacklist]

    html = html.replace("\/", "/")
    html += get_packed_data(html)

    source_list = []
    if generic_patterns or not patterns:
        source_list += __parse_to_list(html, '''["']?label\s*["']?\s*[:=]\s*["']?(?P<label>[^"',]+)["']?(?:[^}\]]+)["']?\s*file\s*["']?\s*[:=,]?\s*["'](?P<url>[^"']+)''')
        source_list += __parse_to_list(html, '''["']?\s*(?:file|src)\s*["']?\s*[:=,]?\s*["'](?P<url>[^"']+)(?:[^}>\]]+)["']?\s*label\s*["']?\s*[:=]\s*["']?(?P<label>[^"',]+)''')
        source_list += __parse_to_list(html, '''video[^><]+src\s*[=:]\s*['"](?P<url>[^'"]+)''')
        source_list += __parse_to_list(html, '''source\s+src\s*=\s*['"](?P<url>[^'"]+)['"](?:.*?res\s*=\s*['"](?P<label>[^'"]+))?''')
        source_list += __parse_to_list(html, '''["'](?:file|url)["']\s*[:=]\s*["'](?P<url>[^"']+)''')
        source_list += __parse_to_list(html, '''param\s+name\s*=\s*"src"\s*value\s*=\s*"(?P<url>[^"]+)''')
    for regex in patterns:
        source_list += __parse_to_list(html, regex)

    source_list = list(set(source_list))

#    common.logger.log(source_list)
    if len(source_list) > 1:
        # sort highest quality first: numerically by the digits in the label,
        # falling back to an alphabetic sort when a label has no digits
        try:
            source_list.sort(key=lambda x: int(re.sub("\D", "", x[0])), reverse=True)
        except:
#            common.logger.log_debug('Scrape sources sort failed |int(re.sub("\D", "", x[0])|')
            try:
                source_list.sort(key=lambda x: re.sub("[^a-zA-Z]", "", x[0]))
            except:
#                common.logger.log_debug('Scrape sources sort failed |re.sub("[^a-zA-Z]", "", x[0])|')
                pass

    return source_list


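# Illustrative usage (not part of the original module): for a hoster whose
# player embeds a plain MP4 link, something like
#   scrape_sources(html, result_blacklist=['.m3u8'],
#                  patterns=['''downloadUrl\s*=\s*["'](?P<url>[^"']+)'''])
# returns (label, url) tuples sorted best quality first; custom patterns must
# use the named group (?P<url>...) and optionally (?P<label>...), since
# __parse_to_list reads matches via groupdict().
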
def get_media_url(url, result_blacklist=None, patterns=None, generic_patterns=True):
    if patterns is None: patterns = []
    scheme = urlparse(url).scheme
    if result_blacklist is None:
        result_blacklist = []
    elif isinstance(result_blacklist, str):
        result_blacklist = [result_blacklist]

    result_blacklist = list(set(result_blacklist + ['.smil']))  # smil (not playable) contains potential sources, only blacklist when called from here
    net = Net()  # use the Net class imported from the local net module above
    headers = {'User-Agent': common.RAND_UA}

    response = net.http_GET(url, headers=headers)
    response_headers = response.get_headers(as_dict=True)
    headers.update({'Referer': url})
    cookie = response_headers.get('Set-Cookie', None)
    if cookie:
        headers.update({'Cookie': cookie})
    html = response.content

    source_list = scrape_sources(html, result_blacklist, scheme, patterns, generic_patterns)
    source = pick_source(source_list)
    return source + append_headers(headers)

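# Illustrative usage (not part of the original module; the URL is made up):
#   link = get_media_url('http://somehoster.example/embed/abc123')
# fetches the page, scrapes and ranks its video sources, and returns the best
# URL with the required User-Agent/Referer/Cookie headers appended after '|'.
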
def cleanse_html(html):
    for match in re.finditer('<!--(.*?)-->', html, re.DOTALL):
        if match.group(1)[-2:] != '//': html = html.replace(match.group(0), '')

    # the regex flags must be passed via the flags keyword; as a fourth
    # positional argument they would be misread as re.sub's count parameter
    html = re.sub('''<(div|span)[^>]+style=["'](visibility:\s*hidden|display:\s*none);?["']>.*?</\\1>''', '', html, flags=re.I | re.DOTALL)
    return html

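# Illustrative note: HTML comments are removed unless their body ends with
# '//' (those are kept, presumably because some hosters hide live markup or
# script fragments inside them), and hidden div/span decoy blocks are
# stripped so the scrape regexes only see visible content.
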
def get_dom(html, tag):
    # crude, regex-free extraction of <tag>... fragments; note that the input
    # is lower-cased, so the returned fragments are too
    start_str = '<%s' % (tag.lower())
    end_str = '</%s' % (tag.lower())

    results = []
    html = html.lower()
    while html:
        start = html.find(start_str)
        end = html.find(end_str, start)
        # advance end past close tags that belong to nested occurrences of
        # the same tag, so the matching close tag is found
        pos = html.find(start_str, start + 1)
        while pos < end and pos != -1:
            tend = html.find(end_str, end + len(end_str))
            if tend != -1: end = tend
            pos = html.find(start_str, pos + 1)

        if start == -1 and end == -1:
            break
        elif start > -1 and end > -1:
            result = html[start:end]
        elif end > -1:
            result = html[:end]
        elif start > -1:
            result = html[start:]
        else:
            break

        results.append(result)
        html = html[start + len(start_str):]

    return results

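# Illustrative example (not part of the original module):
#   get_dom('<DIV>a</DIV><span>b</span>', 'div')
# returns ['<div>a'] - the fragment is lower-cased and runs from the opening
# tag up to (not including) the matching close tag; for anything beyond quick
# scraping a real HTML parser would be preferable.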