source: titan/mediathek/localhoster/lib/python2.7/robotparser.py @ 40099

Last change on this file since 40099 was 40094, checked in by obi, 7 years ago

tithek add yoztube-dl support

File size: 7.4 KB
Line 
1""" robotparser.py
2
3    Copyright (C) 2000  Bastian Kleineidam
4
5    You can choose between two licenses when using this package:
6    1) GNU GPLv2
7    2) PSF license for Python 2.2
8
9    The robots.txt Exclusion Protocol is implemented as specified in
10    http://www.robotstxt.org/norobots-rfc.txt
11
12"""
13import urlparse
14import urllib
15
16__all__ = ["RobotFileParser"]
17
18
19class RobotFileParser:
20    """ This class provides a set of methods to read, parse and answer
21    questions about a single robots.txt file.
22
23    """
24
25    def __init__(self, url=''):
26        self.entries = []
27        self.default_entry = None
28        self.disallow_all = False
29        self.allow_all = False
30        self.set_url(url)
31        self.last_checked = 0
32
33    def mtime(self):
34        """Returns the time the robots.txt file was last fetched.
35
36        This is useful for long-running web spiders that need to
37        check for new robots.txt files periodically.
38
39        """
40        return self.last_checked
41
42    def modified(self):
43        """Sets the time the robots.txt file was last fetched to the
44        current time.
45
46        """
47        import time
48        self.last_checked = time.time()
49
50    def set_url(self, url):
51        """Sets the URL referring to a robots.txt file."""
52        self.url = url
53        self.host, self.path = urlparse.urlparse(url)[1:3]
54
55    def read(self):
56        """Reads the robots.txt URL and feeds it to the parser."""
57        opener = URLopener()
58        f = opener.open(self.url)
59        lines = [line.strip() for line in f]
60        f.close()
61        self.errcode = opener.errcode
62        if self.errcode in (401, 403):
63            self.disallow_all = True
64        elif self.errcode >= 400 and self.errcode < 500:
65            self.allow_all = True
66        elif self.errcode == 200 and lines:
67            self.parse(lines)
68
69    def _add_entry(self, entry):
70        if "*" in entry.useragents:
71            # the default entry is considered last
72            if self.default_entry is None:
73                # the first default entry wins
74                self.default_entry = entry
75        else:
76            self.entries.append(entry)
77
78    def parse(self, lines):
79        """parse the input lines from a robots.txt file.
80           We allow that a user-agent: line is not preceded by
81           one or more blank lines."""
82        # states:
83        #   0: start state
84        #   1: saw user-agent line
85        #   2: saw an allow or disallow line
86        state = 0
87        linenumber = 0
88        entry = Entry()
89
90        self.modified()
91        for line in lines:
92            linenumber += 1
93            if not line:
94                if state == 1:
95                    entry = Entry()
96                    state = 0
97                elif state == 2:
98                    self._add_entry(entry)
99                    entry = Entry()
100                    state = 0
101            # remove optional comment and strip line
102            i = line.find('#')
103            if i >= 0:
104                line = line[:i]
105            line = line.strip()
106            if not line:
107                continue
108            line = line.split(':', 1)
109            if len(line) == 2:
110                line[0] = line[0].strip().lower()
111                line[1] = urllib.unquote(line[1].strip())
112                if line[0] == "user-agent":
113                    if state == 2:
114                        self._add_entry(entry)
115                        entry = Entry()
116                    entry.useragents.append(line[1])
117                    state = 1
118                elif line[0] == "disallow":
119                    if state != 0:
120                        entry.rulelines.append(RuleLine(line[1], False))
121                        state = 2
122                elif line[0] == "allow":
123                    if state != 0:
124                        entry.rulelines.append(RuleLine(line[1], True))
125                        state = 2
126        if state == 2:
127            self._add_entry(entry)
128
129
130    def can_fetch(self, useragent, url):
131        """using the parsed robots.txt decide if useragent can fetch url"""
132        if self.disallow_all:
133            return False
134        if self.allow_all:
135            return True
136
137        # Until the robots.txt file has been read or found not
138        # to exist, we must assume that no url is allowable.
139        # This prevents false positives when a user erronenously
140        # calls can_fetch() before calling read().
141        if not self.last_checked:
142            return False
143
144        # search for given user agent matches
145        # the first match counts
146        parsed_url = urlparse.urlparse(urllib.unquote(url))
147        url = urlparse.urlunparse(('', '', parsed_url.path,
148            parsed_url.params, parsed_url.query, parsed_url.fragment))
149        url = urllib.quote(url)
150        if not url:
151            url = "/"
152        for entry in self.entries:
153            if entry.applies_to(useragent):
154                return entry.allowance(url)
155        # try the default entry last
156        if self.default_entry:
157            return self.default_entry.allowance(url)
158        # agent not found ==> access granted
159        return True
160
161
162    def __str__(self):
163        return ''.join([str(entry) + "\n" for entry in self.entries])
164
165
166class RuleLine:
167    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
168       (allowance==False) followed by a path."""
169    def __init__(self, path, allowance):
170        if path == '' and not allowance:
171            # an empty value means allow all
172            allowance = True
173        path = urlparse.urlunparse(urlparse.urlparse(path))
174        self.path = urllib.quote(path)
175        self.allowance = allowance
176
177    def applies_to(self, filename):
178        return self.path == "*" or filename.startswith(self.path)
179
180    def __str__(self):
181        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
182
183
184class Entry:
185    """An entry has one or more user-agents and zero or more rulelines"""
186    def __init__(self):
187        self.useragents = []
188        self.rulelines = []
189
190    def __str__(self):
191        ret = []
192        for agent in self.useragents:
193            ret.extend(["User-agent: ", agent, "\n"])
194        for line in self.rulelines:
195            ret.extend([str(line), "\n"])
196        return ''.join(ret)
197
198    def applies_to(self, useragent):
199        """check if this entry applies to the specified agent"""
200        # split the name token and make it lower case
201        useragent = useragent.split("/")[0].lower()
202        for agent in self.useragents:
203            if agent == '*':
204                # we have the catch-all agent
205                return True
206            agent = agent.lower()
207            if agent in useragent:
208                return True
209        return False
210
211    def allowance(self, filename):
212        """Preconditions:
213        - our agent applies to this entry
214        - filename is URL decoded"""
215        for line in self.rulelines:
216            if line.applies_to(filename):
217                return line.allowance
218        return True
219
220class URLopener(urllib.FancyURLopener):
221    def __init__(self, *args):
222        urllib.FancyURLopener.__init__(self, *args)
223        self.errcode = 200
224
225    def prompt_user_passwd(self, host, realm):
226        ## If robots.txt file is accessible only with a password,
227        ## we act as if the file wasn't there.
228        return None, None
229
230    def http_error_default(self, url, fp, errcode, errmsg, headers):
231        self.errcode = errcode
232        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
233                                                        errmsg, headers)
Note: See TracBrowser for help on using the repository browser.