From 60d75019271defd8c413b0768288b5069064435f Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 27 Mar 2023 12:07:51 +0200 Subject: [PATCH 01/44] fixing has_header --- Lib/csv.py | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 4ef8be45ca9e0a..1e9483960010a5 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -4,7 +4,6 @@ """ import re -import types from _csv import Error, __version__, writer, reader, register_dialect, \ unregister_dialect, get_dialect, list_dialects, \ field_size_limit, \ @@ -81,8 +80,6 @@ class unix_dialect(Dialect): class DictReader: def __init__(self, f, fieldnames=None, restkey=None, restval=None, dialect="excel", *args, **kwds): - if fieldnames is not None and iter(fieldnames) is fieldnames: - fieldnames = list(fieldnames) self._fieldnames = fieldnames # list of keys for the dict self.restkey = restkey # key to catch long rows self.restval = restval # default value for short rows @@ -129,18 +126,13 @@ def __next__(self): d[key] = self.restval return d - __class_getitem__ = classmethod(types.GenericAlias) - class DictWriter: def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds): - if fieldnames is not None and iter(fieldnames) is fieldnames: - fieldnames = list(fieldnames) self.fieldnames = fieldnames # list of keys for the dict self.restval = restval # for writing short dicts - extrasaction = extrasaction.lower() - if extrasaction not in ("raise", "ignore"): + if extrasaction.lower() not in ("raise", "ignore"): raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" % extrasaction) self.extrasaction = extrasaction @@ -164,8 +156,11 @@ def writerow(self, rowdict): def writerows(self, rowdicts): return self.writer.writerows(map(self._dict_to_list, rowdicts)) - __class_getitem__ = classmethod(types.GenericAlias) - +# Guard Sniffer's type checking against builds that exclude complex() +try: + complex +except NameError: + complex = float class Sniffer: ''' @@ -396,15 +391,16 @@ def has_header(self, sample): # subtracting from the likelihood of the first row being a header. rdr = reader(StringIO(sample), self.sniff(sample)) - + header = next(rdr) # assume first row is header - columns = len(header) columnTypes = {} + average_size = 0 + col_are_strings = True for i in range(columns): columnTypes[i] = None - checked = 0 for row in rdr: + # arbitrary number of rows to check, to keep it sane if checked > 20: break @@ -413,27 +409,45 @@ def has_header(self, sample): if len(row) != columns: continue # skip rows that have irregular number of columns + #checking if all col are strings + for cols in list(columnTypes.keys()): + if row[cols].isnumeric(): + col_are_strings = False + break + for col in list(columnTypes.keys()): thisType = complex + + try: thisType(row[col]) + except (ValueError, OverflowError): # fallback to length of string thisType = len(row[col]) - + + if thisType != columnTypes[col]: if columnTypes[col] is None: # add new column type columnTypes[col] = thisType + average_size += len(row[col]) else: # type is inconsistent, remove column from # consideration del columnTypes[col] + + + # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 + # here we added the special case where all cols are strings and dictionnary has been emptied + if not columnTypes and col_are_strings==True and columns>0: + #dictionary now takes the average length of strings + columnTypes[0] = int(average_size/columns) for col, colType in columnTypes.items(): - if isinstance(colType, int): # it's a length + if type(colType) == type(0): # it's a length if len(header[col]) != colType: hasHeader += 1 else: From e2a76d9006c522e37e75a8658e0b3f259b619f19 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Fri, 7 Apr 2023 13:20:46 +0200 Subject: [PATCH 02/44] adding genericalias --- Lib/csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 1e9483960010a5..19fb9a94b3dec3 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -4,6 +4,7 @@ """ import re +import types from _csv import Error, __version__, writer, reader, register_dialect, \ unregister_dialect, get_dialect, list_dialects, \ field_size_limit, \ @@ -126,7 +127,7 @@ def __next__(self): d[key] = self.restval return d - + __class_getitem__ = classmethod(types.GenericAlias) class DictWriter: def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds): From 02b645dea145190bedff497dde4e28666b8d7012 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Fri, 7 Apr 2023 13:43:28 +0200 Subject: [PATCH 03/44] adding fieldnames --- Lib/csv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Lib/csv.py b/Lib/csv.py index 19fb9a94b3dec3..112e978addbcef 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -81,6 +81,8 @@ class unix_dialect(Dialect): class DictReader: def __init__(self, f, fieldnames=None, restkey=None, restval=None, dialect="excel", *args, **kwds): + if fieldnames is not None and iter(fieldnames) is fieldnames: + fieldnames = list(fieldnames) self._fieldnames = fieldnames # list of keys for the dict self.restkey = restkey # key to catch long rows self.restval = restval # default value for short rows From 6db356fff252f92e03014ba06129dd98f7174b83 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Fri, 7 Apr 2023 14:50:40 +0200 Subject: [PATCH 04/44] correcting deletions --- Lib/csv.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 112e978addbcef..f818b3dfdfaa55 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -133,6 +133,8 @@ def __next__(self): class DictWriter: def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds): + if fieldnames is not None and iter(fieldnames) is fieldnames: + fieldnames = list(fieldnames) self.fieldnames = fieldnames # list of keys for the dict self.restval = restval # for writing short dicts if extrasaction.lower() not in ("raise", "ignore"): @@ -158,7 +160,7 @@ def writerow(self, rowdict): def writerows(self, rowdicts): return self.writer.writerows(map(self._dict_to_list, rowdicts)) - + __class_getitem__ = classmethod(types.GenericAlias) # Guard Sniffer's type checking against builds that exclude complex() try: complex From 2fe30c0c9d961cfc2230d9fc1a1ce69262a09476 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Tue, 11 Apr 2023 23:40:04 +0200 Subject: [PATCH 05/44] correction of comments --- Lib/csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index f818b3dfdfaa55..d8bde70be7181b 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -414,7 +414,7 @@ def has_header(self, sample): if len(row) != columns: continue # skip rows that have irregular number of columns - #checking if all col are strings + # checking if all col are strings for cols in list(columnTypes.keys()): if row[cols].isnumeric(): col_are_strings = False @@ -447,7 +447,7 @@ def has_header(self, sample): # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 - # here we added the special case where all cols are strings and dictionnary has been emptied + # here we added the special case where all cols are strings and dictionary has been emptied if not columnTypes and col_are_strings==True and columns>0: #dictionary now takes the average length of strings columnTypes[0] = int(average_size/columns) From b78cd788a85caffd9d96610cf434e34785c2df98 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Wed, 12 Apr 2023 00:12:06 +0200 Subject: [PATCH 06/44] corrections --- Lib/csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index d8bde70be7181b..06de888ad40f7c 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -449,10 +449,10 @@ def has_header(self, sample): hasHeader = 0 # here we added the special case where all cols are strings and dictionary has been emptied if not columnTypes and col_are_strings==True and columns>0: - #dictionary now takes the average length of strings + # dictionary now takes the average length of strings columnTypes[0] = int(average_size/columns) for col, colType in columnTypes.items(): - if type(colType) == type(0): # it's a length + if type(colType) is int: # it's a length if len(header[col]) != colType: hasHeader += 1 else: From 9ce92bae44931c2084f96d48dc746a1d201d40d7 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Wed, 12 Apr 2023 00:30:08 +0200 Subject: [PATCH 07/44] adding the deleted line (if isinstance(colType, int): # it's a length) --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 06de888ad40f7c..9a516dd8e4b08b 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -452,7 +452,7 @@ def has_header(self, sample): # dictionary now takes the average length of strings columnTypes[0] = int(average_size/columns) for col, colType in columnTypes.items(): - if type(colType) is int: # it's a length + if isinstance(colType, int): # it's a length if len(header[col]) != colType: hasHeader += 1 else: From 8ae2789de85a1827c9be7350e90dbeac23ed320f Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 19:33:13 +0200 Subject: [PATCH 08/44] correcting... --- Lib/csv.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 9fd10c4ba54639..4b3eb7c37d1146 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -132,6 +132,8 @@ def __next__(self): return d __class_getitem__ = classmethod(types.GenericAlias) + + class DictWriter: def __init__(self, f, fieldnames, restval="", extrasaction="raise", dialect="excel", *args, **kwds): @@ -139,7 +141,8 @@ def __init__(self, f, fieldnames, restval="", extrasaction="raise", fieldnames = list(fieldnames) self.fieldnames = fieldnames # list of keys for the dict self.restval = restval # for writing short dicts - if extrasaction.lower() not in ("raise", "ignore"): + extrasaction = extrasaction.lower() + if extrasaction not in ("raise", "ignore"): raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" % extrasaction) self.extrasaction = extrasaction @@ -162,12 +165,9 @@ def writerow(self, rowdict): def writerows(self, rowdicts): return self.writer.writerows(map(self._dict_to_list, rowdicts)) + __class_getitem__ = classmethod(types.GenericAlias) -# Guard Sniffer's type checking against builds that exclude complex() -try: - complex -except NameError: - complex = float + class Sniffer: ''' @@ -400,14 +400,15 @@ def has_header(self, sample): rdr = reader(StringIO(sample), self.sniff(sample)) header = next(rdr) # assume first row is header + columns = len(header) columnTypes = {} average_size = 0 col_are_strings = True for i in range(columns): columnTypes[i] = None + checked = 0 for row in rdr: - # arbitrary number of rows to check, to keep it sane if checked > 20: break @@ -424,8 +425,6 @@ def has_header(self, sample): for col in list(columnTypes.keys()): thisType = complex - - try: thisType(row[col]) @@ -443,9 +442,6 @@ def has_header(self, sample): # consideration del columnTypes[col] - - - # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 From 1c2fef790209455cf03af7008b015f5117bd3bf9 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:27:10 +0200 Subject: [PATCH 09/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 4b3eb7c37d1146..2ddab406518938 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -398,7 +398,6 @@ def has_header(self, sample): # subtracting from the likelihood of the first row being a header. rdr = reader(StringIO(sample), self.sniff(sample)) - header = next(rdr) # assume first row is header columns = len(header) From ba6083c1326c3696961400c39ae7acca834b4c8f Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:27:21 +0200 Subject: [PATCH 10/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 2ddab406518938..ddeaa4197d4c59 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -399,7 +399,6 @@ def has_header(self, sample): rdr = reader(StringIO(sample), self.sniff(sample)) header = next(rdr) # assume first row is header - columns = len(header) columnTypes = {} average_size = 0 From 2b8c27e5365fbf2fe9d89c6c7ca7ce07f8ddd458 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:27:33 +0200 Subject: [PATCH 11/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index ddeaa4197d4c59..81244c46fd983f 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -404,7 +404,6 @@ def has_header(self, sample): average_size = 0 col_are_strings = True for i in range(columns): columnTypes[i] = None - checked = 0 for row in rdr: # arbitrary number of rows to check, to keep it sane From 7ba8c83aa281970e90c8f84b8570d9b43361778b Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:27:42 +0200 Subject: [PATCH 12/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 81244c46fd983f..5e1b0b276dba16 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -415,7 +415,7 @@ def has_header(self, sample): continue # skip rows that have irregular number of columns # checking if all col are strings - for cols in list(columnTypes.keys()): + for cols in columnTypes: if row[cols].isnumeric(): col_are_strings = False break From 92f63f6defe44ed46e142cf8583e52eb3edd4803 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:27:51 +0200 Subject: [PATCH 13/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 5e1b0b276dba16..037ce6e913e0bb 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -424,7 +424,6 @@ def has_header(self, sample): thisType = complex try: thisType(row[col]) - except (ValueError, OverflowError): # fallback to length of string thisType = len(row[col]) From a1d84d61b6dfebfa4f6d2e0879075a40814f0f65 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:28:16 +0200 Subject: [PATCH 14/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 037ce6e913e0bb..85ead09f1a77a6 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -427,8 +427,6 @@ def has_header(self, sample): except (ValueError, OverflowError): # fallback to length of string thisType = len(row[col]) - - if thisType != columnTypes[col]: if columnTypes[col] is None: # add new column type columnTypes[col] = thisType From ca440f7f3f1f78acacc3f1eb6403abc830ad8632 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:28:28 +0200 Subject: [PATCH 15/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 85ead09f1a77a6..21600590513e78 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -435,7 +435,6 @@ def has_header(self, sample): # type is inconsistent, remove column from # consideration del columnTypes[col] - # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 From f60bc70ffc99aab3608f8ab2dc72943233291639 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:28:41 +0200 Subject: [PATCH 16/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 21600590513e78..129e179e27f49c 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -438,7 +438,8 @@ def has_header(self, sample): # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 - # here we added the special case where all cols are strings and dictionary has been emptied + + # special case when all columns are strings and columnTypes has been emptied if not columnTypes and col_are_strings==True and columns>0: # dictionary now takes the average length of strings columnTypes[0] = int(average_size/columns) From 22d8b558f646ca28d1bf776343610ea2b2bb99ea Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:28:54 +0200 Subject: [PATCH 17/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 129e179e27f49c..89afd762e45d18 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -440,7 +440,7 @@ def has_header(self, sample): hasHeader = 0 # special case when all columns are strings and columnTypes has been emptied - if not columnTypes and col_are_strings==True and columns>0: + if not columnTypes and col_are_strings and columns > 0: # dictionary now takes the average length of strings columnTypes[0] = int(average_size/columns) for col, colType in columnTypes.items(): From 285255481a7b83d21f4ebc1e9b31cf16066ff25b Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:29:06 +0200 Subject: [PATCH 18/44] Update Lib/csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/csv.py b/Lib/csv.py index 89afd762e45d18..9666dbfd25436c 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -443,6 +443,7 @@ def has_header(self, sample): if not columnTypes and col_are_strings and columns > 0: # dictionary now takes the average length of strings columnTypes[0] = int(average_size/columns) + for col, colType in columnTypes.items(): if isinstance(colType, int): # it's a length if len(header[col]) != colType: From 1a9ed20ec45a82c6af26feb76ba57ad1055f0c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89ric?= Date: Mon, 1 May 2023 14:34:39 -0400 Subject: [PATCH 19/44] restore blank lines --- Lib/csv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/csv.py b/Lib/csv.py index 9666dbfd25436c..351857de84724a 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -398,12 +398,15 @@ def has_header(self, sample): # subtracting from the likelihood of the first row being a header. rdr = reader(StringIO(sample), self.sniff(sample)) + header = next(rdr) # assume first row is header + columns = len(header) columnTypes = {} average_size = 0 col_are_strings = True for i in range(columns): columnTypes[i] = None + checked = 0 for row in rdr: # arbitrary number of rows to check, to keep it sane @@ -427,6 +430,7 @@ def has_header(self, sample): except (ValueError, OverflowError): # fallback to length of string thisType = len(row[col]) + if thisType != columnTypes[col]: if columnTypes[col] is None: # add new column type columnTypes[col] = thisType From d381110836f7c4df55e9b42652a58d36a4712ae1 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 1 May 2023 20:36:16 +0200 Subject: [PATCH 20/44] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 351857de84724a..5a8d1619e69476 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -446,7 +446,7 @@ def has_header(self, sample): # special case when all columns are strings and columnTypes has been emptied if not columnTypes and col_are_strings and columns > 0: # dictionary now takes the average length of strings - columnTypes[0] = int(average_size/columns) + columnTypes[0] = average_size // columns for col, colType in columnTypes.items(): if isinstance(colType, int): # it's a length From 3384c991be5f7d23b27936dc19d42c0c14ad7bed Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 1 May 2023 18:53:21 +0000 Subject: [PATCH 21/44] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst diff --git a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst new file mode 100644 index 00000000000000..aa62b243515d21 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst @@ -0,0 +1,2 @@ +Added in the has_header() method, a condition that checks if all elements are strings type. + If it's true (in that case, the dictionnary is empty), then we compare the average length of all strings to the header's length in the vote at the end of the function. From d167d6de6068b63ac9902aad80cdd5777edcdc95 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Sat, 6 May 2023 17:55:45 +0200 Subject: [PATCH 22/44] test for issue #102140 --- Lib/test/test_csv.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 8fb97bc0c1a1a7..b1e81349320836 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1188,6 +1188,18 @@ class TestSniffer(unittest.TestCase): abc\0def ghijkl\0mno ghi\0jkl +""" + sample15 = """\ + sample,fastq_1,fastq_2 +A1-35-8,/mnt/scratch/sarek/data/A1-35-8/A1-35-8_R1.fastq.gz,/mnt/scratch/sarek/data/A1-35-8/A1-35-8_R2.fastq.gz +A2-102-5,/mnt/scratch/sarek/data/A2-102-5/A2-102-5_R1.fastq.gz,/mnt/scratch/sarek/data/A2-102-5/A2-102-5_R2.fastq.gz +A5-35-17,/mnt/scratch/sarek/data/A5-35-17/A5-35-17_R1.fastq.gz,/mnt/scratch/sarek/data/A5-35-17/A5-35-17_R2.fastq.gz +AD1-7a,/mnt/scratch/sarek/data/AD1-7a/AD1-7a_R1.fastq.gz,/mnt/scratch/sarek/data/AD1-7a/AD1-7a_R2.fastq.gz +AD1-83a,/mnt/scratch/sarek/data/AD1-83a/AD1-83a_R1.fastq.gz,/mnt/scratch/sarek/data/AD1-83a/AD1-83a_R2.fastq.gz +AD2-60a,/mnt/scratch/sarek/data/AD2-60a/AD2-60a_R1.fastq.gz,/mnt/scratch/sarek/data/AD2-60a/AD2-60a_R2.fastq.gz +Arg1366,/mnt/scratch/sarek/data/Arg1366/Arg1366_R1.fastq.gz,/mnt/scratch/sarek/data/Arg1366/Arg1366_R2.fastq.gz +Br795,/mnt/scratch/sarek/data/Br795/Br795_R1.fastq.gz,/mnt/scratch/sarek/data/Br795/Br795_R2.fastq.gz +Bt100,/mnt/scratch/sarek/data/Bt100/Bt100_R1.fastq.gz,/mnt/scratch/sarek/data/Bt100/Bt100_R2.fastq.gz """ def test_issue43625(self): @@ -1200,6 +1212,7 @@ def test_has_header_strings(self): sniffer = csv.Sniffer() self.assertFalse(sniffer.has_header(self.sample10)) self.assertFalse(sniffer.has_header(self.sample11)) + self.assertTrue(sniffer.has_header(self.sample15)) def test_has_header(self): sniffer = csv.Sniffer() From 17d4a7ec236fe2c5b055ca11b31fe2a39e195285 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Sat, 6 May 2023 18:09:11 +0200 Subject: [PATCH 23/44] adding space --- Lib/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/csv.py b/Lib/csv.py index 5a8d1619e69476..81fb2220f77a6a 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -439,6 +439,7 @@ def has_header(self, sample): # type is inconsistent, remove column from # consideration del columnTypes[col] + # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 From 95c1dae750fda75b8e396f07f0049fae10a4067a Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Sat, 6 May 2023 22:15:15 +0200 Subject: [PATCH 24/44] patch check correction --- Lib/csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 81fb2220f77a6a..690939a1627fd0 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -403,7 +403,7 @@ def has_header(self, sample): columns = len(header) columnTypes = {} - average_size = 0 + average_size = 0 col_are_strings = True for i in range(columns): columnTypes[i] = None @@ -439,7 +439,7 @@ def has_header(self, sample): # type is inconsistent, remove column from # consideration del columnTypes[col] - + # finally, compare results against first row and "vote" # on whether it's a header hasHeader = 0 From eb80cbb98ea85d55960813b3a596c830a7ffc577 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Wed, 10 May 2023 16:07:18 +0200 Subject: [PATCH 25/44] rephrase comment line 449 --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 690939a1627fd0..77add9fc8a94cf 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -446,7 +446,7 @@ def has_header(self, sample): # special case when all columns are strings and columnTypes has been emptied if not columnTypes and col_are_strings and columns > 0: - # dictionary now takes the average length of strings + # If there are only columns of strings and no column types specified, we update the dictionary to store the average length of strings columnTypes[0] = average_size // columns for col, colType in columnTypes.items(): From 3c62566281b4670e794923b63a70acf0cc9fa691 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Wed, 10 May 2023 16:35:38 +0200 Subject: [PATCH 26/44] doc in csv.rst improved for has_header function --- Doc/library/csv.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 64baa69be4af31..40b4f9d4867454 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -294,6 +294,12 @@ The :mod:`csv` module defines the following classes: Twenty rows after the first row are sampled; if more than half of columns + rows meet the criteria, :const:`True` is returned. + + To improve the accuracy of the heuristic, it will check whether all columns + are strings and no column types are specified. + If this condition is satisfied, it will compute the average length of strings + across all columns and stores this value in the columnTypes dictionary with a key of 0. + This information is used to estimate if the first row is a header. .. note:: From e1f3e1b21977969afebe5e9e52115e4e883f4b33 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Wed, 10 May 2023 17:07:45 +0200 Subject: [PATCH 27/44] delete whitespace in csv.rst --- Doc/library/csv.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 40b4f9d4867454..cdba99cdb52c86 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -294,7 +294,7 @@ The :mod:`csv` module defines the following classes: Twenty rows after the first row are sampled; if more than half of columns + rows meet the criteria, :const:`True` is returned. - + To improve the accuracy of the heuristic, it will check whether all columns are strings and no column types are specified. If this condition is satisfied, it will compute the average length of strings From d01e7a4e1667817ae1c8f2e3858d20a96f83e58f Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Wed, 10 May 2023 17:29:02 +0200 Subject: [PATCH 28/44] delete the final whitespaces --- Doc/library/csv.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index cdba99cdb52c86..6565e7de2d7954 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -294,7 +294,7 @@ The :mod:`csv` module defines the following classes: Twenty rows after the first row are sampled; if more than half of columns + rows meet the criteria, :const:`True` is returned. - + To improve the accuracy of the heuristic, it will check whether all columns are strings and no column types are specified. If this condition is satisfied, it will compute the average length of strings From 42c00e1c748f2d6c5a171f2096b80465b0974db7 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Sat, 13 May 2023 22:26:00 +0200 Subject: [PATCH 29/44] merged the 2 loops line 400 --- Lib/csv.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index 77add9fc8a94cf..24a33a09d22a4e 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -417,13 +417,11 @@ def has_header(self, sample): if len(row) != columns: continue # skip rows that have irregular number of columns - # checking if all col are strings - for cols in columnTypes: - if row[cols].isnumeric(): + for col in list(columnTypes.keys()): + # checking if all col are strings + if row[col].isnumeric(): col_are_strings = False - break - for col in list(columnTypes.keys()): thisType = complex try: thisType(row[col]) From bffbcf76ec4ffe867db0a02862b6c07af5420f0f Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 15 May 2023 16:15:24 +0200 Subject: [PATCH 30/44] Update Lib/test/test_csv.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- Lib/test/test_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index a4c23e2ea32e37..d45fc20819bb0c 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -1190,7 +1190,7 @@ class TestSniffer(unittest.TestCase): ghi\0jkl """ sample15 = """\ - sample,fastq_1,fastq_2 +sample,fastq_1,fastq_2 A1-35-8,/mnt/scratch/sarek/data/A1-35-8/A1-35-8_R1.fastq.gz,/mnt/scratch/sarek/data/A1-35-8/A1-35-8_R2.fastq.gz A2-102-5,/mnt/scratch/sarek/data/A2-102-5/A2-102-5_R1.fastq.gz,/mnt/scratch/sarek/data/A2-102-5/A2-102-5_R2.fastq.gz A5-35-17,/mnt/scratch/sarek/data/A5-35-17/A5-35-17_R1.fastq.gz,/mnt/scratch/sarek/data/A5-35-17/A5-35-17_R2.fastq.gz From b3d0bf2685783e3c3487e18ab9f554294a3f97ba Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 15 May 2023 16:31:19 +0200 Subject: [PATCH 31/44] update csv.py --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 24a33a09d22a4e..1f5d37b22573ad 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -444,7 +444,7 @@ def has_header(self, sample): # special case when all columns are strings and columnTypes has been emptied if not columnTypes and col_are_strings and columns > 0: - # If there are only columns of strings and no column types specified, we update the dictionary to store the average length of strings + # If there are only columns of strings and no column types specified, we update the dictionary to store the average length of all strings columnTypes[0] = average_size // columns for col, colType in columnTypes.items(): From 0ab2649859e86aa4f71b313446c58b18f70394c2 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Sun, 28 May 2023 00:36:25 +0200 Subject: [PATCH 32/44] updating csv.rst --- Doc/library/csv.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 6565e7de2d7954..4c50929cc575e9 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -295,11 +295,10 @@ The :mod:`csv` module defines the following classes: Twenty rows after the first row are sampled; if more than half of columns + rows meet the criteria, :const:`True` is returned. - To improve the accuracy of the heuristic, it will check whether all columns - are strings and no column types are specified. - If this condition is satisfied, it will compute the average length of strings - across all columns and stores this value in the columnTypes dictionary with a key of 0. - This information is used to estimate if the first row is a header. + Additionally, if all columns are found to be strings and :py:obj:`columnTypes` + is empty, the method assigns the medium length of all the strings + to the dictionary. + .. note:: From 5aabf604a25394bef4d5a1dc3488febf88834598 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Sun, 28 May 2023 14:07:09 +0200 Subject: [PATCH 33/44] whatsnew update --- Doc/whatsnew/3.13.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 45728d1801d20a..dd2619cc23adcc 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -87,6 +87,14 @@ New Modules Improved Modules ================ +csv +--- + +* Improved ``has_header()`` method in CSV module to provide better header detection. +It now checks if all columns are strings and if the ``columnTypes`` dictionary is empty. +If these conditions are met, the function assigns the median length of the strings. +(Contributed by Vanille-22 & Drakariboo in :gh:`103341`.) + Optimizations ============= From 61dfb3efcdd78e0e998b37db30199c0782b2bf1e Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Sun, 28 May 2023 14:16:21 +0200 Subject: [PATCH 34/44] whatsnew update --- Doc/whatsnew/3.13.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index dd2619cc23adcc..bc488b38514533 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -90,9 +90,10 @@ Improved Modules csv --- -* Improved ``has_header()`` method in CSV module to provide better header detection. -It now checks if all columns are strings and if the ``columnTypes`` dictionary is empty. -If these conditions are met, the function assigns the median length of the strings. +* Improved ``has_header()`` method in CSV module to provide better +header detection. It now checks if all columns are strings and if the +``columnTypes`` dictionary is empty. If these conditions are met, +the function assigns the median length of the strings. (Contributed by Vanille-22 & Drakariboo in :gh:`103341`.) From 34e52faa7df7713a7e7e39be17f1eae5491df733 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Sun, 28 May 2023 14:21:39 +0200 Subject: [PATCH 35/44] whatsnew update --- Doc/whatsnew/3.13.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index bc488b38514533..f77b717b66a346 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -90,12 +90,11 @@ Improved Modules csv --- -* Improved ``has_header()`` method in CSV module to provide better -header detection. It now checks if all columns are strings and if the -``columnTypes`` dictionary is empty. If these conditions are met, -the function assigns the median length of the strings. -(Contributed by Vanille-22 & Drakariboo in :gh:`103341`.) - +* Improved the ``has_header()`` method in the CSV module to provide better + header detection. It now checks if all columns are strings and if the + ``columnTypes`` dictionary is empty. If these conditions are met, + the function assigns the median length of the strings. + (Contributed by Vanille-22 & Drakariboo in :gh:`103341`.) Optimizations ============= From d0f33be4c8f59cfa9ca7948b1d21fb6ecc1c7a6c Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Mon, 29 May 2023 21:59:45 +0200 Subject: [PATCH 36/44] resolve conflict --- Doc/whatsnew/3.13.rst | 8 -------- 1 file changed, 8 deletions(-) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index f77b717b66a346..45728d1801d20a 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -87,14 +87,6 @@ New Modules Improved Modules ================ -csv ---- - -* Improved the ``has_header()`` method in the CSV module to provide better - header detection. It now checks if all columns are strings and if the - ``columnTypes`` dictionary is empty. If these conditions are met, - the function assigns the median length of the strings. - (Contributed by Vanille-22 & Drakariboo in :gh:`103341`.) Optimizations ============= From 3e2b80693aa545d792d29fe5e7211ddc833ec6e0 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Tue, 30 May 2023 00:37:44 +0200 Subject: [PATCH 37/44] misc/news update --- .../Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst index aa62b243515d21..347bba83ca4b0c 100644 --- a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst +++ b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst @@ -1,2 +1,3 @@ -Added in the has_header() method, a condition that checks if all elements are strings type. - If it's true (in that case, the dictionnary is empty), then we compare the average length of all strings to the header's length in the vote at the end of the function. +Enhanced the :meth:`has_header` in the :mod:`CSV` for better header detection. +Now checks if all columns are strings and if the :py:data:`columnTypes` is empty. +If conditions are met, it assigns the median length of the strings to :py:data:`columnTypes`. From b77dc4c2abc3e60e7c600110a92ee9b73ac8d355 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Tue, 30 May 2023 00:48:44 +0200 Subject: [PATCH 38/44] Update Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric --- .../Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst index 347bba83ca4b0c..b7d366f41fd53b 100644 --- a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst +++ b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst @@ -1,3 +1 @@ -Enhanced the :meth:`has_header` in the :mod:`CSV` for better header detection. -Now checks if all columns are strings and if the :py:data:`columnTypes` is empty. -If conditions are met, it assigns the median length of the strings to :py:data:`columnTypes`. +Improve detection in the :meth:`~csv.Sniffer.has_header` method of :class:`csv.Sniffer` when all cells in the first row are strings. From bb136bb99cf686806d753211edf6dff627e63d07 Mon Sep 17 00:00:00 2001 From: Vanille <100017864+Vanille-22@users.noreply.github.com> Date: Tue, 30 May 2023 01:54:10 +0200 Subject: [PATCH 39/44] adding a comment on csv.py & csv.rst update --- Doc/library/csv.rst | 6 +++--- Lib/csv.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Doc/library/csv.rst b/Doc/library/csv.rst index 4c50929cc575e9..afaf464dbcf420 100644 --- a/Doc/library/csv.rst +++ b/Doc/library/csv.rst @@ -295,9 +295,9 @@ The :mod:`csv` module defines the following classes: Twenty rows after the first row are sampled; if more than half of columns + rows meet the criteria, :const:`True` is returned. - Additionally, if all columns are found to be strings and :py:obj:`columnTypes` - is empty, the method assigns the medium length of all the strings - to the dictionary. + Additionally, if all columns are found to be strings and have varying + lengths, the average length of all the strings becomes a crucial factor + in the determination process. .. note:: diff --git a/Lib/csv.py b/Lib/csv.py index 1f5d37b22573ad..2e23be9071b794 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -394,6 +394,8 @@ def has_header(self, sample): # can't be determined, it is assumed to be a string in which case # the length of the string is the determining factor: if all of the # rows except for the first are the same length, it's a header. + # when the strings have varying length, the average length of all + # strings becomes a determining factor. # Finally, a 'vote' is taken at the end for each column, adding or # subtracting from the likelihood of the first row being a header. From 9686653993f2b80bba952a883ebb2c42db743bd7 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:37:20 +0200 Subject: [PATCH 40/44] Update Lib/csv.py Change line 397 : "w" in uppercase. Co-authored-by: C.A.M. Gerlach --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index 2e23be9071b794..f1c6d59fb934aa 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -394,7 +394,7 @@ def has_header(self, sample): # can't be determined, it is assumed to be a string in which case # the length of the string is the determining factor: if all of the # rows except for the first are the same length, it's a header. - # when the strings have varying length, the average length of all + # When the strings have varying length, the average length of all # strings becomes a determining factor. # Finally, a 'vote' is taken at the end for each column, adding or # subtracting from the likelihood of the first row being a header. From 680c67b45d233d94eb5203090c2fd424d4ea06a3 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:39:41 +0200 Subject: [PATCH 41/44] Update Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst Rewording to specify this is more a defect fix than an enhancement Co-authored-by: C.A.M. Gerlach --- .../next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst index b7d366f41fd53b..88e89d9fcf943d 100644 --- a/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst +++ b/Misc/NEWS.d/next/Library/2023-05-01-18-53-20.gh-issue-102140._4gFLu.rst @@ -1 +1 @@ -Improve detection in the :meth:`~csv.Sniffer.has_header` method of :class:`csv.Sniffer` when all cells in the first row are strings. +Fix false negatives in the :meth:`~csv.Sniffer.has_header` method of :class:`csv.Sniffer` when all cells in the first row are strings. From 149938a131323150e099439d85e88f8adb4c2429 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:40:30 +0200 Subject: [PATCH 42/44] Update Lib/csv.py Replacing "checking" by "check" in the comments Co-authored-by: C.A.M. Gerlach --- Lib/csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index f1c6d59fb934aa..c7f72be1953041 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -420,7 +420,7 @@ def has_header(self, sample): continue # skip rows that have irregular number of columns for col in list(columnTypes.keys()): - # checking if all col are strings + # check if all col are strings if row[col].isnumeric(): col_are_strings = False From 4b22c7735ef378bb371eeabfb703e5822a1f0ee4 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:42:06 +0200 Subject: [PATCH 43/44] Update Lib/csv.py Change comments to keep a more reasonable line length and use imperative. Co-authored-by: C.A.M. Gerlach --- Lib/csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/csv.py b/Lib/csv.py index c7f72be1953041..cb2319955b3116 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -446,7 +446,8 @@ def has_header(self, sample): # special case when all columns are strings and columnTypes has been emptied if not columnTypes and col_are_strings and columns > 0: - # If there are only columns of strings and no column types specified, we update the dictionary to store the average length of all strings + # If there are only columns of strings and no column types specified, + # update the dictionary to store the average length of all strings columnTypes[0] = average_size // columns for col, colType in columnTypes.items(): From 20921738724e55e3c73a3e59e674db0f081928c7 Mon Sep 17 00:00:00 2001 From: Drakariboo <108684103+Drakariboo@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:45:28 +0200 Subject: [PATCH 44/44] Update Lib/csv.py change lines 407-410, init and assignment columnTypes directly. Co-authored-by: C.A.M. Gerlach --- Lib/csv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Lib/csv.py b/Lib/csv.py index cb2319955b3116..e58160569e91fa 100644 --- a/Lib/csv.py +++ b/Lib/csv.py @@ -404,10 +404,9 @@ def has_header(self, sample): header = next(rdr) # assume first row is header columns = len(header) - columnTypes = {} + columnTypes = {i: None for i in range(columns)} average_size = 0 col_are_strings = True - for i in range(columns): columnTypes[i] = None checked = 0 for row in rdr: