This is quite a common requirement. Albiet, one I would like to say no to since it is quite easy to get bad data in a spreadsheet. However, we typically don't have the power to say no. Assuming you have controls on data content quality, wouldn't it be nice to FTP the XLS file in binary mode directly to UNIX and dump the contents to a flat file programatically during the sourcing phase?
I was going to write a generic perl XLS data dumper using a format file to describe the XLS, but, alas, like everything else, that simply made the to do list and then I came down off my cloud and I quickly wrote this script to get the job done for a specific XLS. Anyway, it's quite simple and can be modified to handle your specific layout.
This is standard Perl stuff except that it uses an Excel parser module that is freeley available from CPAN. See notes in comments of script.
Of course it has directory structure etc that is specific to the architecture where I wrote this to run, but again that can easily be modified. The real work here is in the use of the module that makes this task a snap, hands off and schedulable.
Code: Select all
#!/usr/bin/perl
##############################################################################
#
# Program: ExtractXLSCommission.pl
#
# Description: See ShowBlurb function below for details
#
# Notes: This script requires some CPAN modules:
# * IO-stringy
# * OLE-Storage_Lite
# * Spreadsheet-ParseExcel
# All of these are available at http://www.cpan.org
# They musy be installed in the above order.
#
# === Modification History ===================================================
# Date Author Comments
# ---------- --------------- -------------------------------------------------
# 07-31-2003 Steve Boyce Created.
#
##############################################################################
use Getopt::Std;
use File::Basename;
use Spreadsheet::ParseExcel;
##############################################################################
sub ShowBlurb
{
print <<ENDOFBLURB;
Syntax: ExtractXLSCommission.pl -h <ParameterFile> <FilesetDate>
Description: Specific conversion script for Commission file.
Parameters: ParameterFile - Name of parameter file.
FilesetDate - Date of directory in file induction area
Format: "YYYY-MM-DD"
Options: -h This help.
Notes: commission.xls has one worksheet.
It has one header row. It has no footer rows. All non header rows
will be processed. Blank rows will be ignored without an error.
The Target extract file will always be overwritten without warning.
Progress messages and error messages are written to STD-OUT.
ENDOFBLURB
}
##############################################################################
sub Now
{
my ($InFormat) = @_;
my $RetVal = "";
my ($Seconds, $Minutes, $Hours, $Day, $MonthNumber, $YearNumber, $WeekDayNumber, $DayOfYear, $IsDayLightSavings) = localtime(time);
my $Year = $YearNumber + 1900;
my $Month = sprintf("%02d", $MonthNumber + 1);
$Day = sprintf("%02d", $Day);
$Hours = sprintf("%02d", $Hours);
$Minutes = sprintf("%02d", $Minutes);
$Seconds = sprintf("%02d", $Seconds);
if ($InFormat eq "YYYYMMDD") { $RetVal = "$Year$Month$Day"; }
elsif ($InFormat eq "YYYY-MM-DD") { $RetVal = "$Year-$Month-$Day"; }
elsif ($InFormat eq "DDMMYYYY") { $RetVal = "$Day$Month$Year"; }
elsif ($InFormat eq "DD-MM-YYYY") { $RetVal = "$Day-$Month-$Year"; }
elsif ($InFormat eq "YYYYMMDD.HH24MISS") { $RetVal = "$Year$Month$Day.$Hours$Minutes$Seconds"; }
else { $RetVal = "$Year-$Month-$Day $Hours:$Minutes:$Seconds"; }
return $RetVal;
}
##############################################################################
sub NowBlurb
{
return Now, " ", basename($0);
}
##############################################################################
sub DieWith
{
my ($MessageLine) = @_;
print NowBlurb, " $MessageLine\n";
exit 1;
}
##############################################################################
sub GetParameters
{
my ($sParameterFile, $sImportsFilesetDirectory) = @_;
my $nStartOfValue = 0;
my $nLength = 0;
if (open fhParameterFile, "<".$sParameterFile) {
while (<fhParameterFile>) {
chop;
if ($_ =~ /^ImportsFilesetDirectory/) {
$nStartOfValue = index($_, "=") + 1;
$nLength = length($_);
$$sImportsFilesetDirectory = substr($_, $nStartOfValue, $nLength - $nStartOfValue);
}
}
close fhParameterFile;
}
else {
DieWith("Error: Unable to open file ($sParameterFile).");
}
}
##############################################################################
sub IsADate
{
my ($sInDate, $sFormat) = @_;
my $sTrimmedDate = $sInDate;
my $lReturnValue = $cFalse;
#-- We are expecting a date in string format like "MM/DD/YYYY"
#-- Strip out whitespaces
$sTrimmedDate =~ s/ |\t|\n//g;
if (length($sTrimmedDate) == 10) {
#-- Correct length
if ($sFormat eq "MM/DD/YYYY") {
if (substr($sTrimmedDate,2,1) eq "/" and substr($sTrimmedDate,5,1) eq "/") {
#-- Correct slash placement
if (substr($sTrimmedDate,0,2) >= 1 and substr($sTrimmedDate,0,2) <= 12) {
#-- Valid month
if (substr($sTrimmedDate,3,2) >= 1 and substr($sTrimmedDate,3,2) <= 31) {
#-- Valid day
if (substr($sTrimmedDate,6,4) >= 1900 and substr($sTrimmedDate,6,4) <= 9999) {
#-- Valid year
$lReturnValue = $cTrue;
}
}
}
}
}
elsif ($sFormat eq "YYYY-MM-DD") {
if (substr($sTrimmedDate,4,1) eq "-" and substr($sTrimmedDate,7,1) eq "-") {
#-- Correct slash placement
if (substr($sTrimmedDate,5,2) >= 1 and substr($sTrimmedDate,5,2) <= 12) {
#-- Valid month
if (substr($sTrimmedDate,8,2) >= 1 and substr($sTrimmedDate,8,2) <= 31) {
#-- Valid day
if (substr($sTrimmedDate,0,4) >= 1900 and substr($sTrimmedDate,0,4) <= 9999) {
#-- Valid year
$lReturnValue = $cTrue;
}
}
}
}
}
}
return $lReturnValue;
}
##############################################################################
sub FormatMMDDYYYYToYYYYMMDD
{
my ($InDate) = @_;
#-- Convert MM/DD/YYYY to YYYYMMDD
my $sReturnValue = substr($InDate,6,4).substr($InDate,0,2).substr($InDate,3,2);
return $sReturnValue;
}
##############################################################################
sub FormatToFloatingPointDecimal
{
my ($oWorkSheetCell, $Precision) = @_;
my $sReturnValue = 0;
if (defined $oWorkSheetCell) {
#-- This value is not empty
$sReturnValue = sprintf("%1.${Precision}f", $oWorkSheetCell->{Val})
}
return $sReturnValue;
}
##############################################################################
sub FormatToString
{
my ($oWorkSheetCell) = @_;
my $sReturnValue = "";
if (defined $oWorkSheetCell) {
#-- This value is not empty
$sReturnValue = $oWorkSheetCell->{Val};
$sReturnValue =~ s/ |\t|\n//g;
}
return $sReturnValue;
}
##############################################################################
sub ProcessWorksheet
{
my ($fhExtractFile, $oWorkSheet) = @_;
my $sWorkSheetName = $oWorkSheet->{Name};
my $iMaxRows = $oWorkSheet->{MaxRow};
my $iMaxCols = 0;
my $iCurRow = 0;
my $iCurCol = 0;
my $nNumberOfRowWritten = 0;
print NowBlurb, " Processing WorkSheet : ", $sWorkSheetName, "...\n";
print NowBlurb, " MaxRows (zeros based) : ", $iMaxRows, "\n";
#-- Make sure we have minimum required rows (based on 1 data row + 1 header row)
if ($iMaxRows >= 1) {
#-- Loop through rows
for($iCurRow = 0; $iCurRow <= $iMaxRows; $iCurRow++) {
#-- Skip the first row
if ($iCurRow > 0) {
#-- OK let's process this row
#-- Make sure it has the appropriate number of columns
$iMaxCols = $oWorkSheet->{MaxCol};
if ($iMaxCols == 15) {
#-- This row has correct number of columns
#-- Write out row to output file
#-- 1 Group_key
print $fhExtractFile FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][0], 0);
#-- 2 Personality_key
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][1], 0);
#-- 3 Comm_pay_key
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][2], 0);
#-- 4 Category_id
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][3], 0);
#-- 5 Vendor_id
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][4], 0);
#-- 6 Item_id
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][5], 0);
#-- 7 Order_dt
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][6], 0);
#-- 8 Subsidiary_desc
print $fhExtractFile "|", FormatToString($oWorkSheet->{Cells}[$iCurRow][7]);
#-- 9 Hour_desc
print $fhExtractFile "|", FormatToString($oWorkSheet->{Cells}[$iCurRow][8]);
#-- 10 Comm_pct
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][9], 2);
#-- 11 Comm_per_unit
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][10], 2);
#-- 12 Comm_factor
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][11], 2);
#-- 13 Vendor_exclude
print $fhExtractFile "|", FormatToString($oWorkSheet->{Cells}[$iCurRow][12]);
#-- 14 Item_exclude
print $fhExtractFile "|", FormatToString($oWorkSheet->{Cells}[$iCurRow][13]);
#-- 15 Comm_beg_dt
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][14], 0);
#-- 16 Comm_end_dt
print $fhExtractFile "|", FormatToFloatingPointDecimal($oWorkSheet->{Cells}[$iCurRow][15], 0);
#-- LineFeed
print $fhExtractFile "\n";
$nNumberOfRowWritten ++;
}
else {
#-- Invalid number of columns
DieWith("Error: Row $iCurRow (zeros based) has incorrect number of columns ($iMaxCols).");
}
}
}
}
else {
#-- Invalid number of columns
DieWith("Error: WorkSheet must contain at least 2 rows.");
}
return $nNumberOfRowWritten;
}
##############################################################################
sub ProcessExcelFile
{
my ($sExcelFile, $fhExtractFile) = @_;
my $oExcel;
my $oWorkBook;
my @oWorksheetPointers = ();
my $oWorkSheet;
my $nNumberOfRowWritten = 0;
print NowBlurb, " Opening Excel spreadsheet: $sExcelFile...", "\n";
$oExcel = new Spreadsheet::ParseExcel;
$oWorkBook = $oExcel->Parse($sExcelFile);
print NowBlurb, " Validating Excel spreadsheet...", "\n";
#-- Do some basic validation on the spreadsheet
$NumberOfSheets = $oWorkBook->{SheetCount};
#-- Make sure we have exactly three sheets
if ($NumberOfSheets == 1) {
#-- Correct number of sheets
#-- Load up array of Worksheet Object Pointers
@oWorksheetPointers = (@{$oWorkBook->{Worksheet}});
foreach my $oWorkSheet (@oWorksheetPointers) {
print NowBlurb, " WorkSheet : ", $oWorkSheet, " : ", $oWorkSheet->{Name}, "\n";
}
#-- Make sure each sheet has expected name
if ($oWorksheetPointers[0]->{Name} eq "Commission") {
#-- All expected sheets exist with proper names
#-- Process each Worksheet one at a time
foreach $oWorkSheet (@oWorksheetPointers) {
$nNumberOfRowWritten += ProcessWorksheet($fhExtractFile, $oWorkSheet);
}
}
else {
#-- Invalid number of sheets
DieWith("Error: Spreadsheet contains WorkSheets with invalid names. Expecting commission");
}
}
else {
#-- Invalid number of sheets
DieWith("Error: Spreadsheet must contain only three WorkSheets ($NumberOfSheets).");
}
return $nNumberOfRowWritten;
}
##############################################################################
sub WriteReadyFile
{
my ($sExtractFile, $nNumberOfRowWritten) = @_;
my $sReadyFilePath = dirname($sExtractFile);
my $sReadyFileName = basename($sExtractFile).".rdy";
$sReadyFile = $sReadyFilePath.$cSlash.$sReadyFileName;
print NowBlurb, " Ready file name: $sReadyFile\n";
if (open (fhReadyFile, ">$sReadyFile")) {
print fhReadyFile Now("YYYYMMDD"), sprintf("%010d", $nNumberOfRowWritten), "\n";
close fhReadyFile;
print NowBlurb, " Ready file closed.\n";
}
else {
DieWith("Error: Unable to create file ($sReadyFile).");
}
}
##############################################################################
#-- Main
#-- Global variables (constants)
$cTrue = 1;
$cFalse = 0;
$cIsUnix = $cFalse;
if (!($^O =~ /Win/)) {
$cIsUnix = $cTrue;
}
$cSlash = "\\";
if ($cIsUnix) {
$cSlash = "\/";
}
#-- Local variables
my $nNumArgs = 0;
my $sParameterFile = "";
my $sFilesetDate = "";
my $sStrippedFilesetDate = "";
my $sTargXLSFile = "commission.xls";
my $sTargExportFile = "commission";
my $sImportsFilesetDirectory = "";
my $sExcelFile = "";
my $sExtractFile = "";
my $nNumberOfRowWritten = 0;
print NowBlurb, " Initializing...", "\n";
if (getopts('h')) {
if ( $opt_h ) {
ShowBlurb();
exit 2;
}
$nNumArgs = scalar(@ARGV);
if ( $nNumArgs == 2 ) {
$sParameterFile = $ARGV[0];
#-- Make sure Parameter file exists and is readable
if ( -r $sParameterFile ) {
#-- Process parameters
print NowBlurb, " Reading parameters from: $sParameterFile...\n";
GetParameters($sParameterFile, \$sImportsFilesetDirectory);
print NowBlurb, " TargXLSFile: $sTargXLSFile\n";
print NowBlurb, " TargExportFile: $sTargExportFile\n";
print NowBlurb, " ImportsFilesetDirectory: $sImportsFilesetDirectory\n";
$sFilesetDate = $ARGV[1];
#-- Make sure FilesetDate is a date
if (IsADate($sFilesetDate, "YYYY-MM-DD")) {
#-- Replace #DTE# token in ImportsFilesetDirectory
$sStrippedFilesetDate = $sFilesetDate;
$sStrippedFilesetDate =~ tr /-//d;
$sImportsFilesetDirectory =~ s/#DTE#/$sStrippedFilesetDate/;
print NowBlurb, " ImportsFilesetDirectory: $sImportsFilesetDirectory\n";
#-- Build Excel and Export file names
$sExcelFile = $sImportsFilesetDirectory.$cSlash.$sTargXLSFile;
$sExtractFile = $sImportsFilesetDirectory.$cSlash.$sTargExportFile;
print NowBlurb, " ExcelFile: $sExcelFile\n";
print NowBlurb, " ExtractFile: $sExtractFile\n";
#-- Open output file
print NowBlurb, " Opening output extract file : $sExtractFile...", "\n";
if (open (fhExtractFile, ">$sExtractFile")) {
$nNumberOfRowWritten = ProcessExcelFile($sExcelFile, \*fhExtractFile);
close fhExtractFile;
print NowBlurb, " Extract file closed. $nNumberOfRowWritten rows written.\n";
print NowBlurb, " Creating ready file...\n";
WriteReadyFile($sExtractFile, $nNumberOfRowWritten);
print NowBlurb, " Complete.\n";
}
else {
DieWith("Error: Unable to create file ($sExtractFile).");
}
}
else {
DieWith("Error: Invalid FilesetDate ($sFilesetDate).");
}
}
else {
DieWith("Error: Unable to read file ($sParameterFile).");
}
}
else {
DieWith("Error: Missing parameter. <ParameterFile> <FilesetDate>");
}
}
else {
DieWith("Error: Invalid options.");
}