import os
import ntpath
import numpy as np
import pandas as pd
import openpyxl
import xlrd
from .helpers import check_valid_xls
from .sniffer import XLSSniffer
#******************************************************************
# convertor
#******************************************************************
[docs]class XLStoCSVMultiFile(object):
"""
Converts xls|xlsx files to csv files. Selects a SINGLE SHEET from each file. To extract MULTIPLE SHEETS from a file use XLStoCSVMultiSheet
Args:
fname_list (list): file paths, eg ['dir/a.csv','dir/b.csv']
cfg_xls_sheets_sel_mode (string): mode to select tabs
* ``name``: select by name, provide name for each file, can customize by file
* ``name_global``: select by name, one name for all files
* ``idx``: select by index, provide index for each file, can customize by file
* ``idx_global``: select by index, one index for all files
cfg_xls_sheets_sel (list): values to select tabs **NEEDS TO BE IN THE SAME ORDER AS `fname_list`**
logger (object): logger object with send_log(), optional
"""
def __init__(self, fname_list, cfg_xls_sheets_sel_mode, cfg_xls_sheets_sel, logger=None):
self.logger = logger
self.set_files(fname_list)
self.set_select_mode(cfg_xls_sheets_sel_mode, cfg_xls_sheets_sel)
[docs] def set_files(self, fname_list):
"""
Update input files. You will also need to update sheet selection with ``.set_select_mode()``.
Args:
fname_list (list): see class description for details
"""
self.fname_list = fname_list
self.xlsSniffer = XLSSniffer(fname_list)
[docs] def set_select_mode(self, cfg_xls_sheets_sel_mode, cfg_xls_sheets_sel):
"""
Update sheet selection values
Args:
cfg_xls_sheets_sel_mode (string): see class description for details
cfg_xls_sheets_sel (list): see class description for details
"""
assert cfg_xls_sheets_sel_mode in ['name','idx','name_global','idx_global']
sheets = self.xlsSniffer.dict_xls_sheets
if cfg_xls_sheets_sel_mode=='name_global':
cfg_xls_sheets_sel_mode = 'name'
cfg_xls_sheets_sel = dict(zip(self.fname_list,[cfg_xls_sheets_sel]*len(self.fname_list)))
elif cfg_xls_sheets_sel_mode=='idx_global':
cfg_xls_sheets_sel_mode = 'idx'
cfg_xls_sheets_sel = dict(zip(self.fname_list,[cfg_xls_sheets_sel]*len(self.fname_list)))
if not set(cfg_xls_sheets_sel.keys())==set(sheets.keys()):
raise ValueError('Need to select a sheet from every file')
# check given selection actually present in files
if cfg_xls_sheets_sel_mode=='name':
if not np.all([cfg_xls_sheets_sel[fname] in sheets[fname]['sheets_names'] for fname in self.fname_list]):
raise ValueError('Invalid sheet name selected in one of the files')
# todo show which file is mismatched
elif cfg_xls_sheets_sel_mode=='idx':
if not np.all([cfg_xls_sheets_sel[fname] <= sheets[fname]['sheets_count'] for fname in self.fname_list]):
raise ValueError('Invalid index selected in one of the files')
# todo show which file is mismatched
else:
raise ValueError('Invalid xls_sheets_mode')
self.cfg_xls_sheets_sel_mode = cfg_xls_sheets_sel_mode
self.cfg_xls_sheets_sel = cfg_xls_sheets_sel
def _convert_single(self, fname):
if self.logger:
self.logger.send_log('converting file: '+ntpath.basename(fname)+' | sheet: '+ str(self.cfg_xls_sheets_sel[fname]),'ok')
fname_out = fname+'-'+str(self.cfg_xls_sheets_sel[fname])+'.csv'
df = pd.read_excel(fname, sheet_name=self.cfg_xls_sheets_sel[fname], dtype='str')
df.to_csv(fname_out,index=False)
return fname_out
[docs] def convert_all(self):
"""
Executes conversion. Writes to the same path as file and appends .csv to filename.
Returns:
list: output file names
"""
# todo: customize output dir. customize output filename
# read files
fnames_converted = []
for fname in self.fname_list:
fname_out = self._convert_single(fname)
fnames_converted.append(fname_out)
return fnames_converted
[docs]class XLStoCSVMultiSheet(object):
"""
Converts ALL SHEETS from a SINGLE xls|xlsx files to separate csv files
Args:
fname (string): file path
logger (object): logger object with send_log()
"""
def __init__(self, fname, logger=None):
assert type(fname) is str
self.logger = logger
self.set_files(fname)
[docs] def set_files(self, fname):
self.fname = fname
self.xlsSniffer = XLSSniffer([fname,])
def _convert_single(self, fname):
return fname_out
[docs] def convert_all(self):
# todo: customize output dir
# read files
fnames_converted = []
for iSheet in self.xlsSniffer[self.fname]['sheet_name']:
if self.logger:
self.logger.send_log('sniffing sheets in '+ntpath.basename(fname),'ok')
fname_out = fname+'-'+str(iSheet)+'.csv'
df = pd.read_excel(fname, sheet_name=iSheet, dtype='str')
df.to_csv(fname_out,index=False)
fnames_converted.append(fname_out)
return fnames_converted