PoshCode Logo PowerShell Code Repository

Parse HTML Tables by Carter Shanklin 5 years ago
embed code: <script type="text/javascript" src="http://PoshCode.org/embed/561"></script>download | new post

A function to parse tables out of HTML files and return them as PowerShell objects.

  1. # Parse tables within HTML files and return the rows as PowerShell objects.
  2. # The idea here is similar to (though not nearly as complete as) Perl's HTML::TableParse.
  3. # This function should run anywhere but it's a bit slow because of the COM interface
  4. # it uses. There seem to be a few .NET libraries out there that would make it a lot
  5. # faster but you may not have those installed. Please improve this if you can.
  6. #
  7. # One other quirk is that this function will only return one table at a time, through
  8. # the tableNumber parameter. If you need to extract multiple tables you need to make
  9. # multiple calls. This was done because PowerShell seems to make it difficult to make
  10. # arrays of arrays, preferring one big happy array instead. Please improve if you
  11. # know how.
  12. #
  13. # TODO: Make it run faster.
  14.  
  15. function get-rowInner {
  16.         param($inputObject, $unique=0, $trim=0)
  17.  
  18.         $values = @()
  19.         foreach ($obj in $inputObject) {
  20.                 if ($obj.nodeName -eq "TD" -or $obj.nodeName -eq "TH") {
  21.                         $value = $obj.IHTMLElement_innerText
  22.                         if ($trim) {
  23.                                 $value = $value.trim()
  24.                         }
  25.                         if ($unique) {
  26.                                 if ($values -contains $value) {
  27.                                         $i = 2
  28.                                         while ($values -contains ($value + $i)) {
  29.                                                 $i++
  30.                                         }
  31.                                         $values += ($value + $i)
  32.                                 } else {
  33.                                         $values += $value
  34.                                 }
  35.                         } else {
  36.                                 $values += $value
  37.                         }
  38.                 }
  39.         }
  40.  
  41.         if ($values.length -gt 0) {
  42.                 return $values
  43.         } else {
  44.                 return $null
  45.         }
  46. }      
  47.  
  48. function get-row {
  49.         param($inputObject, $unique=0, $trim=0)
  50.  
  51.         if ($inputObject.nodeName -eq "TR") {
  52.                 # We are at the row level.
  53.                 return get-rowInner -inputObject $inputObject.childnodes -unique $unique -trim $trim
  54.         } else {
  55.                 # Rows can be nested inside other tags.
  56.                 foreach ($node in $inputObject.childnodes) {
  57.                         $row = get-row -inputObject $node -unique $unique -trim $trim
  58.                         if ($row -ne $null) {
  59.                                 return $row
  60.                         }
  61.                 }
  62.         }
  63. }
  64.  
  65. function get-table {
  66.         param($inputObjects)
  67.  
  68.         # We treat the first row as column headings.
  69.         $headings = $null
  70.         $rows = @()
  71.  
  72.         foreach ($obj in $inputObjects) {
  73.                 if ($headings -eq $null) {
  74.                         # The first row will be the headings.
  75.                         $headings = get-row -inputObject $obj -unique 1 -trim 1
  76.                         continue
  77.                 }
  78.  
  79.                 $row = get-row -inputObject $obj
  80.                 if ($row -ne $null -and $row.length -eq $headings.length) {
  81.                         $rowObject = new-object psobject
  82.                         for ($i = 0; $i -lt $headings.length; $i++) {
  83.                                 $value = $row[$i]
  84.                                 if ($value -eq $null) {
  85.                                         $value = ""
  86.                                 }
  87.                                 $rowObject | add-member -type noteproperty -name $headings[$i] -value $value
  88.                         }
  89.                         $rows += $rowObject
  90.                 }
  91.         }
  92.  
  93.         return $rows
  94. }
  95.  
  96. function Parse-HtmlTableRecursive {
  97.         param($inputObjects)
  98.  
  99.         foreach ($_ in $inputObjects) {
  100.                 if ($_.nodeName -eq "TBODY") {
  101.                         if (--$global:htmlParseCount -eq 0) {
  102.                                 return get-table -inputObjects $_.childnodes
  103.                         }
  104.                 }
  105.  
  106.                 if ($_.childnodes -ne $null) {
  107.                         $table = Parse-HtmlTableRecursive -inputObjects $_.childnodes
  108.                         if ($table) {
  109.                                 return $table
  110.                         }
  111.                 }
  112.         }
  113.  
  114.         return $null
  115. }
  116.  
  117. function Parse-HtmlTable {
  118.         param($url, $tableNumber=1)
  119.  
  120.         $client = new-object net.webclient
  121.         $htmltext = $client.downloadstring($url)
  122.  
  123.         # For testing local files
  124.         #$temp = gc $url
  125.         #$htmltext = ''
  126.         #for ($i = 0; $i -lt $temp.length; $i++) {
  127.         #       $htmltext += $temp[$i]
  128.         #}
  129.  
  130.         $global:htmlParseCount = $tableNumber
  131.         $h = new-object -com "HTMLFILE"
  132.         $h.IHTMLDocument2_write($htmltext)
  133.         $ret = Parse-HtmlTableRecursive -inputObject $h.body
  134.         remove-variable -scope global htmlParseCount
  135.         return $ret
  136. }
  137.  
  138. # Example: Get the 250 most common words in the English language.
  139. # Parse-HtmlTable -url http://esl.about.com/library/vocabulary/bl1000_list1.htm
  140. # Parse-HtmlTable -url http://esl.about.com/library/vocabulary/bl1000_list1.htm | select Word, Word2

Submit a correction or amendment below (
click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.

Syntax highlighting:


Remember me