"""
csv.py - read/write/investigate CSV files
"""
---|
6 | import re |
---|
7 | from functools import reduce |
---|
8 | from _csv import Error, __version__, writer, reader, register_dialect, \ |
---|
9 | unregister_dialect, get_dialect, list_dialects, \ |
---|
10 | field_size_limit, \ |
---|
11 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ |
---|
12 | __doc__ |
---|
13 | from _csv import Dialect as _Dialect |
---|
14 | |
---|
15 | try: |
---|
16 | from cStringIO import StringIO |
---|
17 | except ImportError: |
---|
18 | from StringIO import StringIO |
---|
19 | |
---|
# Public names: constants and functions re-exported from _csv, plus the
# pure-Python dialect classes and helpers defined below.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
---|
25 | |
---|
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel). Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders -- concrete subclasses must override these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    quoting = None
    lineterminator = None

    def __init__(self):
        # Only subclasses are considered valid; instantiating the abstract
        # base itself leaves _valid False (and _validate will reject it).
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate the real attribute checking to the C-level _csv.Dialect
        # and convert its TypeError into this module's Error.
        try:
            _Dialect(self)
        except TypeError as e:
            # "except E as e" replaces the old "except E, e" form, which is
            # a syntax error under Python 3; the "as" form works on 2.6+.
            # We do this for compatibility with py2.3
            raise Error(str(e))
---|
56 | |
---|
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True          # '""' inside a quoted field means a literal '"'
    skipinitialspace = False
    lineterminator = '\r\n'     # Excel writes DOS line endings
    quoting = QUOTE_MINIMAL     # quote only fields that need it
register_dialect("excel", excel)
---|
66 | |
---|
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'            # all other properties inherited from excel
register_dialect("excel-tab", excel_tab)
---|
71 | |
---|
72 | |
---|
class DictReader:
    """Iterate over CSV rows from *f*, returning each row as a dict.

    Keys come from *fieldnames*, or from the first row read when
    *fieldnames* is None.  Surplus values in a long row are collected in
    a list under *restkey*; keys missing from a short row get *restval*.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0               # mirrors the underlying reader's count

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the first row as the header the first time the
        # fieldnames are needed.
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                # Empty input: fieldnames stays None.
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    # Issue 20004: Because DictReader is a classic class, this setter is
    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
    # base class for fear of breaking working code.  If you want to change
    # fieldnames without overwriting the getter, set _fieldnames directly.
    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next row as a dict (Python 2 iterator protocol)."""
        if self.line_num == 0:
            # Used only for its side effect: forces the header row to be read.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # long row: stash the surplus values under restkey
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # short row: pad the missing keys with restval
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
---|
125 | |
---|
126 | |
---|
class DictWriter:
    """Write dicts to *f* as CSV rows, ordered by *fieldnames*.

    Keys missing from a row are written as *restval*.  Keys not listed in
    *fieldnames* either raise ValueError or are silently dropped,
    depending on *extrasaction* ("raise" or "ignore").
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call-form raise replaces the old two-argument "raise E, v"
            # statement, which is a syntax error under Python 3.
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Emit a header row built from the fieldnames themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Project the dict onto a list in fieldnames order, padding missing
        # keys with restval; complain about extras when configured to.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        # Convert every dict up front so a bad row raises before anything
        # is written to the underlying writer.
        rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
        return self.writer.writerows(rows)
---|
159 | |
---|
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Minimal builds may omit complex(); fall back to float so the
    # type-probing loop in Sniffer.has_header keeps working.
    complex = float
---|
165 | |
---|
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # Call-form raise replaces the old two-argument "raise E, v"
            # statement, which is a syntax error under Python 3.
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # BUGFIX: the third pattern previously read "(?P<delim>>[...])" --
        # the stray '>' made the delim group require a literal '>' before
        # the delimiter character, so the ',"..."<EOL>' case never matched.
        # The '>' has been removed to mirror the sibling patterns.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            # findall returns tuples of the groups; map group names to
            # tuple indices via groupindex (1-based -> 0-based).
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # pattern had no delim group (last pattern above)
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # the most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # spaces counted one per match, so equality means every
            # delimiter was followed by a space
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # drop empty lines (Python 2: filter returns a list here)
        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    # character never appears; not a candidate
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
---|