Coverage for /var/devmt/py/utils4_1.5.0rc1/utils4/reader.py: 100%

42 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-12 15:38 +0100

1#!/usr/bin/env python 

2# -*- coding: utf-8 -*- 

3""" 

4:Purpose: This module contains purpose-built data readers for formats 

5 which are no longer supported, namely: 

6 

7 - **.xls:** pre-Excel 5.0/95 Workbook 

8 

9:Platform: Linux/Windows | Python 3.6+ 

10:Developer: J Berendt 

11:Email: development@s3dev.uk 

12 

13:Comments: n/a 

14 

15:Example: 

16 

17 Example for reading an old-style .xls (pre-Excel 5.0/95) Workbook into a 

18 DataFrame:: 

19 

20 >>> from utils4.reader import reader 

21 >>> df = reader.read_xls('/path/to/file.xls') 

22 

23""" 

24# pylint: disable=invalid-name 

25# pylint: disable=wrong-import-order 

26 

27import io 

28import os 

29import pandas as pd 

30import xlrd 

31from datetime import datetime as dt 

32 

33 

34class Reader: 

35 """Class wrapper for various data reading methods. 

36 

37 For details on each reader, refer to the docstring for that reader. 

38 

39 """ 

40 

41 def read_xls(self, 

42 filepath: str, 

43 encoding: str=None, 

44 sheet_index: int=0, 

45 skiprows: int=0, 

46 skipcols: int=0, 

47 chopcols: int=0, 

48 date_formats: dict=None, 

49 errors: str='coerce', 

50 fill_date_errors: bool=False) -> pd.DataFrame: 

51 """Read a pre-Excel 5.0/95 .XLS file into a DataFrame. 

52 

53 This function is designed to deal with *old* XLS files which 

54 the ``pandas.read_excel`` function *does not support*. 

55 

56 Args: 

57 filepath (str): Full path to the file to be read. 

58 encoding (str, optional): Encoding used to read the XLS file. 

59 Defaults to None. 

60 sheet_index (int, optional): Index of the sheet to be read, 

61 zero-based. Defaults to 0. 

62 skiprows (int, optional): Number of rows to skip (from the 

63 beginning of the file). Defaults to 0. 

64 skipcols (int, optional): Number of columns to skip (from the left). 

65 Defaults to 0. 

66 chopcols (int, optional): Number of columns to skip/chop (from the 

67 right). Defaults to 0. 

68 date_formats (dict, optional): Dictionary of 

69 ``{col_name: strftime_mask}``. Defaults to None. 

70 errors (str, optional): Method used by :func:`~pandas.read_csv` to 

71 resolve date parsing errors. Defaults to 'coerce'. 

72 fill_date_errors (bool, optional): Fill coerced NaT date errors 

73 with '1900-01-01'. Defaults to False. 

74 

75 :Logic: 

76 The passed XLS file is opened and parsed by the ``xlrd`` library, 

77 then read into an in-memory stream buffer, which is 

78 passed into ``pandas.read_csv`` function for conversion to a 

79 DataFrame. 

80 

81 Raises: 

82 ValueError: If the file extension is not ``.xls``. 

83 IOError: If the workbook does not contain any rows of data. 

84 

85 Returns: 

86 df (pd.DataFrame): A DataFrame containing the contents of 

87 the XLS file. 

88 

89 """ 

90 if os.path.splitext(filepath)[1].lower() != '.xls': 

91 raise ValueError('The file *must* be an XLS file.') 

92 chopcols = -chopcols if chopcols else None 

93 stream = io.StringIO(newline='\n') 

94 wb = xlrd.open_workbook(filepath, encoding_override=encoding, formatting_info=True) 

95 ws = wb.sheet_by_index(sheet_index) 

96 if not ws.nrows: 

97 raise IOError('This workbook does not contain any rows of data.') 

98 rows = ws.get_rows() 

99 if skiprows: 

100 for _ in range(skiprows): 

101 next(rows) 

102 for r in rows: 

103 row = r[skipcols:chopcols] 

104 # Ensure xldate formats are parsed correctly. 

105 data = self._extract_row(row=row) 

106 stream.write(data + '\n') 

107 _ = stream.seek(0) 

108 df = pd.read_csv(stream) 

109 if date_formats: 

110 for col, fmt in date_formats.items(): 

111 df[col] = pd.to_datetime(df[col], format=fmt, errors=errors) 

112 if fill_date_errors: 

113 # Changed to remove inplace=True due to pandas v3.0 deprecation warnings. 

114 df[col] = df[col].fillna(dt(1900,1,1)) 

115 stream.close() 

116 return df 

117 

118 @staticmethod 

119 def _extract_row(row: iter) -> str: 

120 """Extract and parse each row. 

121 

122 Args: 

123 row (iter): Iterable object which is converted into a string, 

124 separated by the separator specified by the ``sep`` argument. 

125 sep (str, optional): Separator character. Defaults to ``','``. 

126 

127 Returns: 

128 str: A string containing all row values, separated by the ``sep`` 

129 character. 

130 

131 """ 

132 def _value_generator(row: iter) -> str: 

133 """Parse each row value based on its ``xf_index`` value. 

134 

135 Args: 

136 row (iter): Iterable object. 

137 

138 Yields: 

139 str: Each parsed value from the iterable. 

140 

141 """ 

142 for i in row: 

143 if i.xf_index == 62: 

144 val = xlrd.xldate.xldate_as_datetime(i.value, 0) 

145 else: 

146 val = i.value 

147 yield str(val) 

148 return ','.join(_value_generator(row=row)) 

149 

150 

151reader = Reader()