I'm a Python beginner writing a script that uses openpyxl to read large xlsx files (about 60000 x 187) into NumPy arrays for some machine learning. My code:
from openpyxl import load_workbook
import re
from numpy import *

wb = load_workbook(filename='dataSheet.xlsx', use_iterators=True)
ws1 = wb.get_sheet_by_name(name='LogFileData')

startCol = 1  # indexed from 1
startRow = 2  # start at least from 2 because the header is in row 1
endCol = ws1.get_highest_column()  # index of last used column, from 1
endRow = ws1.get_highest_row()     # index of last used row, from 1
diff = endRow - startRow + 1       # number of rows in the data array

header = []                               # holds the column labels
data = zeros((0, endCol), dtype=float64)  # 2D array that holds the data

# put the column headers into a list (worksheet is indexed from 1)
for row in ws1.get_squared_range(1, 1, endCol, 1):
    for cell in row:
        for match in re.findall("<(.*?)>", cell.value):
            header.append(match)

# worksheet rows are indexed from 1; the NumPy arrays (tempRow, tempPt, data) from 0
for index, row in enumerate(ws1.iter_rows(row_offset=1)):
    tempRow = zeros((1, 0), dtype=float64)
    tempPt = zeros((1, 1), dtype=float64)
    for cell in row:
        value = cell.value
        if isinstance(value, basestring):
            tempPt[0][0] = nan  # text cells become NaN
        else:
            tempPt[0][0] = value
        tempRow = hstack((tempRow, tempPt))
    data = vstack((data, tempRow))
Is openpyxl with the optimized reader the fastest and most space-efficient way to do this? A colleague mentioned that reading a csv file with itertools or a similar package might be faster.
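For context, this is roughly the kind of csv-based reader I understood my colleague to mean. It is only a sketch: it assumes the sheet has been exported to dataSheet.csv, that the first row holds the labels, and that the row and column counts (60000 x 187 in my case) are known up front so the array can be preallocated.

import csv
import itertools
import numpy as np

N_ROWS, N_COLS = 60000, 187  # assumed known in advance

data = np.empty((N_ROWS, N_COLS), dtype=np.float64)
with open('dataSheet.csv', 'rb') as f:
    reader = csv.reader(f)
    header = next(reader)  # first row holds the column labels
    # islice caps the iteration at N_ROWS data rows
    for i, row in enumerate(itertools.islice(reader, N_ROWS)):
        for j, value in enumerate(row):
            try:
                data[i, j] = float(value)  # numeric cell
            except ValueError:
                data[i, j] = np.nan        # text or empty cell becomes NaN

Would something like that actually beat the openpyxl optimized reader for files of this size?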
Edit 1: my specs
Ubuntu 10.04 LTS on VMware (Windows 7 Enterprise host)
Python 2.6.5
Intel i5 quad core, 2.5 GHz