Delphi: Some tip to parse this html table?

2020-07-19 06:21发布

问题:

some time I'm trying to get data from this html table, I tried components paid and free. I tried to do some coding and also got no results. I have a class that throw directly html tables for ClientDataSet, but with this table it does not work. Anyone have any tips on how to get the data in this html table? Or a way to convert it to txt / xls / csv or xml? Follows the code for the table:

  WebBrowser1.Navigate('http://site2.aesa.pb.gov.br/aesa/monitoramentoPluviometria.do?metodo=listarMesesChuvasMensais');
  WebBrowser1.OleObject.Document.All.Tags('select').Item(0).Value:= '2013';
  WebBrowser1.OleObject.Document.All.Tags('select').Item(1).Value:= '7';
  WebBrowser1.OleObject.Document.All.Tags('input').Item(1).click;
  Memo1.Text:= WebBrowser1.OleObject.Document.All.Tags('table').Item(10).InnerHTML;
  Memo1.Lines.SaveToFile('table.html');

回答1:

The following will extract the data from the HTML table on your target page and load it into a ClientDataSet.

It's fairly long-winded, perhaps demonstrating that as David said, Delphi is maybe not the best tool for the job.

On my Form1, I have a TEdit, edValue, for me to key in the value in the first data row in the HTML table data. I use this as a way to find the table in the HTML document. I dare say there are better methods, but at least my method should be more robust than hard-coding assumptions about the layout of the document in which the table is embedded that maybe won't survive a change by the page's author.

Broadly, the code works by first finding the HTML table cell using the contents of my edValue.Text, then finding the table to which the cell belongs, and then populating the CDS's Fields and data from the table.

The CDS fields are set to 255 characters by default; maybe there's a specification for the data published on the web page that would allow you to use a smaller value for some, if not all, fields. They're all assumed to be of type ftString, to avoid the code choking on unexpected cell contents.

Btw, at the bottom is a utility function for saving the HTML page locally, to save having to keep clicking the button for selecting a year + month. To reload the WebBrowser from the saved file, just use the file's name as the URL to load.

TForm1 = class(TForm)
[ ... ]
public
  { Public declarations }
  Doc : IHtmlDocument2;

procedure TForm1.btnFindValueClick(Sender: TObject);
var
  Table : IHTMLTable;
begin
  Doc := WebBrowser1.Document as IHTMLDocument2;
  Table := FindTableByCellValue(edValue.Text);
  Assert(Table <> Nil);
  LoadCDSFromHTMLTable(CDS, Table);
end;

procedure TForm1.LoadCDSFromHTMLTable(DestCDS : TClientDataSet; Table : IHTMLTable);
var
  I,
  J : Integer;
  vTable : OleVariant;
  iRow : IHTMLTableRow;
  FieldName,
  FieldValue : String;
  Field : TField;
const
  cMaxFieldSize = 255;
  scIDFieldName = 'ID';
begin
  //  Use OleVariant instead of IHTMLTable becuse it's less fiddly for doing what follows
  vTable := Table;
  Assert(not DestCDS.Active and (DestCDS.FieldCount = 0));

  //  First create an AutoInc field
  Field := TAutoIncField.Create(Self);
  Field.FieldName := scIDFieldName;
  Field.DataSet := DestCDS;


  // Next create CDS fields from the names in the cells in the first row of the table
  for I := 0 to (vTable.Rows.Item(0).Cells.Length - 1) do begin
    FieldName := vTable.Rows.Item(0).Cells.Item(I).InnerText;
    Field := TStringField.Create(Self);
    // At this point, we might want to clean up the FieldName by removing embedded spaces, etc
    Field.FieldName := FieldName;
    Field.Size := cMaxFieldSize;
    Field.DataSet := DestCDS;
  end;

  DestCDS.DisableControls;
  try
    DestCDS.IndexFieldNames := scIDFieldName;
    DestCDS.CreateDataSet;

    //  Next load the HTML table data into the CDS
    for I := 1 to (vTable.Rows.Length - 1) do begin
      DestCDS.Insert;
      for J := 0 to vTable.Rows.Item(0).Cells.Length - 1 do begin
        FieldValue := vTable.Rows.Item(I).Cells.Item(J).InnerText;
        // the J + 1 is because Fields[0] is the autoinc one
        DestCDS.Fields[J + 1].AsString := FieldValue;
      end;
      DestCDS.Post;
    end;
    DestCDS.First;
  finally
    DestCDS.EnableControls;
  end;
end;

function TForm1.FindTableCellByTagValue(Doc : IHtmlDocument2; const AValue : String) : IHTMLTableCell;
var
  All: IHTMLElementCollection;
  Value: String;
  I,
  Len: Integer;
  E: OleVariant;
  iE : IHTMLElement;
  iT : IHTMLTextElement;
  iC : IHTMLTableCell;
begin
  Result := Nil;
  All := Doc.All;
  if All = Nil then Exit;
  Len := All.Length;

  for I := 0 to Len - 1 do begin
    E := All.Item(I, varEmpty);
    iE := IDispatch(E) as IHTMLElement;
    if Supports(iE, IHTMLTableCell, iC) then begin
      Value := Trim(iE.Get_InnerText);
      if Pos(Trim(AValue), Value) = 1 then begin
        Result := iC;
        Break;
      end
    end
    else
      Continue;
  end;
end;

function TForm1.FindTableByCellValue(Value : String): IHTMLTable;
var
  Node : IHtmlElement;
  iTable : IHTMLTable;
  iCell : IHTMLTableCell;
begin
  Result := Nil;
  iCell := FindTableCellByTagValue(Doc, edValue.Text);
  if iCell = Nil then
    Exit;
  Node := IDispatch(iCell) as IHtmlElement;

  //  if we found a Node with the cell text we were looking for,
  //  we can now find the HTML table to which it belongs

  while Node <> Nil do begin
    Node := Node.parentElement;
    if Supports(Node, IHTMLTable, iTable) then begin
      Result := iTable;
      Break;
    end;
  end;
end;

procedure TForm1.SaveFileLocally(const FileName : String);
var
  PFile: IPersistFile;  // declared in ActiveX unit
begin
  PFile := Doc as IPersistFile;
  PFile.Save(StringToOleStr(FileName), False);
end;


回答2:

after some time studying I finally extract data from html table. To simplify I can extract data from html table directly, without having to 'parse' it was the tag 'table' and 'item' 11 the 'item' 10 had the same data but in a single cell. So what I did, I took each element of the table in html and StringGrid filled one, and then found a way to directly populate the dbgrid through ClientDataSet. I'll post the code (unit) to stand as an example and for that you need someone. I wanted to thank everyone who helped me in the comments. With more study'm seeing that the best way to do this procedure is to MSHTML.

unit Unit1;

interface

uses
  Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
  Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.OleCtrls, SHDocVw, Vcl.StdCtrls,
  Vcl.Grids, Vcl.DBGrids, Data.DB, Datasnap.DBClient;

type
  TForm1 = class(TForm)
    WebBrowser1: TWebBrowser;
    DBGrid1: TDBGrid;
    StringGrid1: TStringGrid;
    Button1: TButton;
    Button2: TButton;
    ClientDataSet1: TClientDataSet;
    DataSource1: TDataSource;
    ClientDataSet1MunicípioPosto: TStringField;
    ClientDataSet1TotalMensalmm: TStringField;
    ClientDataSet1ClimatologiaMensalmm: TStringField;
    ClientDataSet1Desviomm: TStringField;
    ClientDataSet1Desvio: TStringField;
    ClientDataSet1id: TAutoIncField;
    procedure FormCreate(Sender: TObject);
    procedure Button1Click(Sender: TObject);
    procedure Button2Click(Sender: TObject);
  private
    { Private declarations }
  public
    { Public declarations }
  end;

var
  Form1: TForm1;

implementation

{$R *.dfm}

procedure TForm1.Button1Click(Sender: TObject);
var
 irow, jcol: Integer;
 ovTable: OleVariant;
begin

 ovTable := WebBrowser1.OleObject.Document.all.tags('table').item(11);
 ShowMessage('Number of Rows: '+IntToStr(ovTable.Rows.Length));
 ShowMessage('Number of Cols: '+IntToStr(ovTable.Rows.Item(0).Cells.Length));
 StringGrid1.RowCount:= ovTable.Rows.Length+1;
 StringGrid1.ColCount:= ovTable.Rows.Item(0).Cells.Length+1;
 for irow := 0 to (ovTable.Rows.Length - 1) do
 begin
   for jcol := 0 to (ovTable.Rows.Item(irow).Cells.Length - 1) do
   begin
     StringGrid1.Cells[jcol+1, irow+1] := ovTable.Rows.Item(irow).Cells.Item(jcol).InnerText;
   end;
 end;
end;

procedure TForm1.Button2Click(Sender: TObject);

var
iRow : Integer;
iCol : Integer;
ovTable: OleVariant;

begin
  ovTable := WebBrowser1.OleObject.Document.all.tags('table').item(11);
  for iRow := 1 to (ovTable.Rows.Length - 1) do
    begin
      ClientDataSet1.Open;
      ClientDataSet1.insert;
      for iCol := 0 to (ovTable.Rows.Item(iRow).Cells.Length - 1) do
      begin
      ClientDataSet1.FieldByname('Município/Posto').AsString:=ovTable.Rows.Item(iRow).Cells.Item(0).InnerText;
      ClientDataSet1.FieldByname('Total Mensal (mm)').AsString:=ovTable.Rows.Item(iRow).Cells.Item(1).InnerText;
      ClientDataSet1.FieldByname('Climatologia Mensal (mm)').AsString:=ovTable.Rows.Item(iRow).Cells.Item(2).InnerText;
      ClientDataSet1.FieldByname('Desvio (mm)').AsString:=ovTable.Rows.Item(iRow).Cells.Item(3).InnerText;
      ClientDataSet1.FieldByname('Desvio (%)').AsString:=ovTable.Rows.Item(iRow).Cells.Item(4).InnerText;

      end;
      ClientDataSet1.Post;
      ClientDataSet1.IndexFieldNames:= 'id';
      ClientDataSet1.First;
  end;

end;

procedure TForm1.FormCreate(Sender: TObject);
begin
  WebBrowser1.Navigate('C:\htmlwiththetable.html');
end;

end

.