1 | """
|
---|
2 | URLResolver Addon for Kodi
|
---|
3 | Copyright (C) 2016 t0mm0, tknorris
|
---|
4 |
|
---|
5 | This program is free software: you can redistribute it and/or modify
|
---|
6 | it under the terms of the GNU General Public License as published by
|
---|
7 | the Free Software Foundation, either version 3 of the License, or
|
---|
8 | (at your option) any later version.
|
---|
9 |
|
---|
10 | This program is distributed in the hope that it will be useful,
|
---|
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | GNU General Public License for more details.
|
---|
14 |
|
---|
15 | You should have received a copy of the GNU General Public License
|
---|
16 | along with this program. If not, see <http://www.gnu.org/licenses/>.
|
---|
17 | """
|
---|
import re
import urllib
#import xbmcgui
import jsunpack
from urlparse import urlparse
#from urlresolver import common
#from urlresolver.resolver import ResolverError
import common
from net import Net

def get_hidden(html, form_id=None, index=None, include_submit=True):
    """Collect a form's hidden <input> fields (and optionally its submit button) as a name -> value dict."""
    hidden = {}
    if form_id:
        pattern = '''<form [^>]*(?:id|name)\s*=\s*['"]?%s['"]?[^>]*>(.*?)</form>''' % (form_id)
    else:
        pattern = '''<form[^>]*>(.*?)</form>'''

    html = cleanse_html(html)

    for i, form in enumerate(re.finditer(pattern, html, re.DOTALL | re.I)):
        # common.logger.log(form.group(1))
        if index is None or i == index:
            for field in re.finditer('''<input [^>]*type=['"]?hidden['"]?[^>]*>''', form.group(1)):
                match = re.search('''name\s*=\s*['"]([^'"]+)''', field.group(0))
                match1 = re.search('''value\s*=\s*['"]([^'"]*)''', field.group(0))
                if match and match1:
                    hidden[match.group(1)] = match1.group(1)

            if include_submit:
                match = re.search('''<input [^>]*type=['"]?submit['"]?[^>]*>''', form.group(1))
                if match:
                    name = re.search('''name\s*=\s*['"]([^'"]+)''', match.group(0))
                    value = re.search('''value\s*=\s*['"]([^'"]*)''', match.group(0))
                    if name and value:
                        hidden[name.group(1)] = value.group(1)

    # common.logger.log_debug('Hidden fields are: %s' % (hidden))
    return hidden
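
# Example (illustrative sketch; the form snippet and values are hypothetical):
#   html = '<form name="F1"><input type="hidden" name="op" value="download2">' \
#          '<input type="hidden" name="id" value="abc123"></form>'
#   get_hidden(html, form_id='F1')  # -> {'op': 'download2', 'id': 'abc123'}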

def pick_source(sources, auto_pick=None):
    """Pick a stream url from a list of (label, url) tuples.

    The interactive Kodi dialog is commented out below, so the first source is
    returned whether or not auto_pick is set; an empty list yields None.
    """
    # if auto_pick is None:
    #     auto_pick = common.get_setting('auto_pick') == 'true'

    if len(sources) == 1:
        return sources[0][1]
    elif len(sources) > 1:
        if auto_pick:
            return sources[0][1]
        else:
            return sources[0][1]

            # result = xbmcgui.Dialog().select(common.i18n('choose_the_link'), [str(source[0]) if source[0] else 'Unknown' for source in sources])
            # if result == -1:
            #     raise ResolverError(common.i18n('no_link_selected'))
            # else:
            #     return sources[result][1]
    # else:
    #     raise ResolverError(common.i18n('no_video_link'))
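
# Example (illustrative; the urls are hypothetical):
#   pick_source([('720p', 'http://cdn.example/v720.mp4'), ('360p', 'http://cdn.example/v360.mp4')])
#   # -> 'http://cdn.example/v720.mp4'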


def append_headers(headers):
    """Serialise request headers into the '|Header=Value&Header=Value' suffix format used for stream urls."""
    return '|%s' % '&'.join(['%s=%s' % (key, urllib.quote_plus(headers[key])) for key in headers])
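
# Example (illustrative):
#   append_headers({'User-Agent': 'Mozilla/5.0', 'Referer': 'http://host.example/'})
#   # -> '|User-Agent=Mozilla%2F5.0&Referer=http%3A%2F%2Fhost.example%2F' (key order may vary)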

def get_packed_data(html):
    """Unpack any 'eval(function(p,a,c,k,e,d)...)' packed javascript found in the page and return it as one string."""
    packed_data = ''
    for match in re.finditer('(eval\s*\(function.*?)</script>', html, re.DOTALL | re.I):
        try:
            js_data = jsunpack.unpack(match.group(1))
            js_data = js_data.replace('\\', '')
            packed_data += js_data
        except:
            pass

    return packed_data
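
# Example (illustrative; 'page_html' stands for a fetched hoster page):
#   unpacked = get_packed_data(page_html)  # '' if the page has no packed scripts
#   page_html += unpacked                  # same trick scrape_sources() uses below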

def parse_sources_list(html):
    """Parse a jwplayer-style "sources: [...]" block into a list of (label, url) tuples."""
    sources = []
    match = re.search('''['"]?sources['"]?\s*:\s*\[(.*?)\]''', html, re.DOTALL)
    if match:
        sources = [(match[1], match[0].replace('\/', '/')) for match in re.findall('''['"]?file['"]?\s*:\s*['"]([^'"]+)['"][^}]*['"]?label['"]?\s*:\s*['"]([^'"]*)''', match.group(1), re.DOTALL)]
    return sources
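
# Example (illustrative; the jwplayer setup is hypothetical):
#   html = 'jwplayer().setup({sources: [{file: "http:\/\/cdn.example\/v.mp4", label: "720"}]});'
#   parse_sources_list(html)  # -> [('720', 'http://cdn.example/v.mp4')]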

def parse_html5_source_list(html):
    """Parse html5 <source> tags into (label, url) tuples; labels come from data-res when present, otherwise from the mime type."""
    label_attrib = 'type' if not re.search('''<source\s+src\s*=.*?data-res\s*=.*?/\s*>''', html) else 'data-res'
    sources = [(match[1], match[0].replace('\/', '/')) for match in re.findall('''<source\s+src\s*=\s*['"]([^'"]+)['"](?:.*?''' + label_attrib + '''\s*=\s*['"](?:video/)?([^'"]+)['"])''', html, re.DOTALL)]
    return sources
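
# Example (illustrative):
#   html = '<video><source src="http://cdn.example/v.mp4" type="video/mp4" /></video>'
#   parse_html5_source_list(html)  # -> [('mp4', 'http://cdn.example/v.mp4')]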

def parse_smil_source_list(smil):
    """Parse a SMIL playlist into (label, 'base playpath=src') tuples, labelling by width/height when available."""
    sources = []
    base = re.search('base\s*=\s*"([^"]+)', smil).groups()[0]
    for i in re.finditer('src\s*=\s*"([^"]+)(?:"\s*(?:width|height)\s*=\s*"([^"]+))?', smil):
        label = 'Unknown'
        if (len(i.groups()) > 1) and (i.group(2) is not None):
            label = i.group(2)
        sources += [(label, '%s playpath=%s' % (base, i.group(1)))]
    return sources
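
# Example (illustrative; the rtmp values are hypothetical):
#   smil = '<smil><meta base="rtmp://cdn.example/live"/><video src="stream123" height="720"/></smil>'
#   parse_smil_source_list(smil)  # -> [('720', 'rtmp://cdn.example/live playpath=stream123')]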

def scrape_sources(html, result_blacklist=None, scheme='http', patterns=None, generic_patterns=True):
    """Scrape a page for stream urls and return (label, url) tuples, sorted by the numeric part of the label (highest first) when possible."""
    if patterns is None: patterns = []

    def __parse_to_list(_html, regex):
        _blacklist = ['.jpg', '.jpeg', '.gif', '.png', '.js', '.css', '.htm', '.html', '.php', '.srt', '.sub', '.xml', '.swf', '.vtt', '.mpd']
        _blacklist = set(_blacklist + result_blacklist)
        streams = []
        labels = []
        for r in re.finditer(regex, _html, re.DOTALL):
            match = r.groupdict()
            stream_url = match['url'].replace('&amp;', '&')
            file_name = urlparse(stream_url[:-1]).path.split('/')[-1] if stream_url.endswith("/") else urlparse(stream_url).path.split('/')[-1]
            blocked = not file_name or any(item in file_name.lower() for item in _blacklist)
            if stream_url.startswith('//'): stream_url = scheme + ':' + stream_url
            if '://' not in stream_url or blocked or (stream_url in streams) or any(stream_url == t[1] for t in source_list):
                continue

            label = match.get('label', file_name)
            if label is None: label = file_name
            labels.append(label)
            streams.append(stream_url)

        matches = zip(labels, streams)
        # if matches:
        #     common.logger.log_debug('Scrape sources |%s| found |%s|' % (regex, matches))
        return matches

    if result_blacklist is None:
        result_blacklist = []
    elif isinstance(result_blacklist, str):
        result_blacklist = [result_blacklist]

    html = html.replace("\/", "/")
    html += get_packed_data(html)

    source_list = []
    if generic_patterns or not patterns:
        source_list += __parse_to_list(html, '''["']?label\s*["']?\s*[:=]\s*["']?(?P<label>[^"',]+)["']?(?:[^}\]]+)["']?\s*file\s*["']?\s*[:=,]?\s*["'](?P<url>[^"']+)''')
        source_list += __parse_to_list(html, '''["']?\s*(?:file|src)\s*["']?\s*[:=,]?\s*["'](?P<url>[^"']+)(?:[^}>\]]+)["']?\s*label\s*["']?\s*[:=]\s*["']?(?P<label>[^"',]+)''')
        source_list += __parse_to_list(html, '''video[^><]+src\s*[=:]\s*['"](?P<url>[^'"]+)''')
        source_list += __parse_to_list(html, '''source\s+src\s*=\s*['"](?P<url>[^'"]+)['"](?:.*?res\s*=\s*['"](?P<label>[^'"]+))?''')
        source_list += __parse_to_list(html, '''["'](?:file|url)["']\s*[:=]\s*["'](?P<url>[^"']+)''')
        source_list += __parse_to_list(html, '''param\s+name\s*=\s*"src"\s*value\s*=\s*"(?P<url>[^"]+)''')
    for regex in patterns:
        source_list += __parse_to_list(html, regex)

    source_list = list(set(source_list))

    # common.logger.log(source_list)
    if len(source_list) > 1:
        try:
            source_list.sort(key=lambda x: int(re.sub("\D", "", x[0])), reverse=True)
        except:
            # common.logger.log_debug('Scrape sources sort failed |int(re.sub("\D", "", x[0])|')
            try:
                source_list.sort(key=lambda x: re.sub("[^a-zA-Z]", "", x[0]))
            except:
                common.logger.log_debug('Scrape sources sort failed |re.sub("[^a-zA-Z]", "", x[0])|')

    return source_list
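
# Example (illustrative; the page snippet is hypothetical):
#   page = 'player.setup({file: "http://cdn.example/v.mp4", label: "480"});'
#   scrape_sources(page)  # -> [('480', 'http://cdn.example/v.mp4')]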


def get_media_url(url, result_blacklist=None, patterns=None, generic_patterns=True):
    """Fetch a hoster page, scrape it for stream urls and return the picked url with the request headers appended."""
    if patterns is None: patterns = []
    scheme = urlparse(url).scheme
    if result_blacklist is None:
        result_blacklist = []
    elif isinstance(result_blacklist, str):
        result_blacklist = [result_blacklist]

    result_blacklist = list(set(result_blacklist + ['.smil']))  # smil (not playable) contains potential sources, only blacklist when called from here
    net = common.Net()
    headers = {'User-Agent': common.RAND_UA}

    response = net.http_GET(url, headers=headers)
    response_headers = response.get_headers(as_dict=True)
    headers.update({'Referer': url})
    cookie = response_headers.get('Set-Cookie', None)
    if cookie:
        headers.update({'Cookie': cookie})
    html = response.content

    source_list = scrape_sources(html, result_blacklist, scheme, patterns, generic_patterns)
    source = pick_source(source_list)
    return source + append_headers(headers)
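
# Example (illustrative; the hoster url and result are hypothetical):
#   get_media_url('http://hoster.example/embed-abc123.html')
#   # -> 'http://cdn.example/v.mp4|User-Agent=...&Referer=...'
#   # (the '|Header=Value' suffix built by append_headers travels with the url)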

def cleanse_html(html):
    """Remove html comments (unless they end in '//') and elements hidden with inline css."""
    for match in re.finditer('<!--(.*?)-->', html, re.DOTALL):
        if match.group(1)[-2:] != '//': html = html.replace(match.group(0), '')

    # pass the flags by keyword; as a positional argument they would be treated as re.sub's count
    html = re.sub('''<(div|span)[^>]+style=["'](visibility:\s*hidden|display:\s*none);?["']>.*?</\\1>''', '', html, flags=re.I | re.DOTALL)
    return html
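
# Example (illustrative):
#   cleanse_html('<!-- junk --><div style="display:none">decoy</div><p>real</p>')
#   # -> '<p>real</p>'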

def get_dom(html, tag):
    """Return the (lower-cased) occurrences of a tag, handling nested tags of the same name."""
    start_str = '<%s' % (tag.lower())
    end_str = '</%s' % (tag.lower())

    results = []
    html = html.lower()
    while html:
        start = html.find(start_str)
        end = html.find(end_str, start)
        pos = html.find(start_str, start + 1)
        while pos < end and pos != -1:
            tend = html.find(end_str, end + len(end_str))
            if tend != -1: end = tend
            pos = html.find(start_str, pos + 1)

        if start == -1 and end == -1:
            break
        elif start > -1 and end > -1:
            result = html[start:end]
        elif end > -1:
            result = html[:end]
        elif start > -1:
            result = html[start:]
        else:
            break

        results.append(result)
        html = html[start + len(start_str):]

    return results
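
# Example (illustrative):
#   get_dom('<DIV id="a">one</DIV><div>two</div>', 'div')
#   # -> ['<div id="a">one', '<div>two']  (results are lower-cased and keep the opening tag)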