01-20-2006, 08:38 PM
hello, thought somebody might find this interesting so here it is. dont forget to change the i/o files as they are currently hard-coded
Code:
' HTML Parser v1.0
' Yetifoot 2006
'
' Parses a HTML file and produces a report
'
' This program breaks down a HTML file into its constituent parts. It seperates
' tags from text. Text is treated in a similar way to a browsers treatment, ie
' it removes excess whitespace. Tags are broken down to each attribute,
' by name and value. Comment tags ('!') are split depending on their type,
' either a DTD command like DOCTYPE, or a standard comment like <!-- comment
' <style>, <script> and <title> tags are also recognised and treated
' appropriately. As the parser works, it produces a report, to show the
' working.
' Often the report will show text of just a single whitespace. This is
' important to the formatting of the output so I leave it in.
' Obsessive checking of EOF, and end of tag mean that when an error is
' encountered the parser should get back on track quickly, and not go into an
' infinite loop.
' Generally I have tried to be quite loose with the syntax, to enable non
' standard HTML to still have a chance of being understood.
' The only error i have found that can cause the parser to not recover is if
' the terminating '--' is left out of a comment. FF & IE seem to recover quite
' well from this error, but I wasn't able to think of a better way yet, without
' breaking the one pass nature of the compiler.
' On inspection of the HTML parsing functions you will see that they all are
' based on the simple loop:
' While (NOT delimiter)
' s = s & Chr(Look)
' ReadChar
' Wend
' just with small changes to deal with any particular cases.
' v1.0 update - added more parsing of style scripts
Option Explicit
Dim Shared Look As Integer
Dim Shared iFileName As String
Dim Shared iFileHan As Integer
Dim Shared oFileName As String
Dim Shared oFileHan As Integer
Dim Shared last_tagtype As String
iFileName = "C:\Documents and Settings\Jefferson\Desktop\viewtopic.htm"
oFileName = "C:\Documents and Settings\Jefferson\Desktop\report.txt"
' ---------------------------------------------------------------------------- '
' Input
' ---------------------------------------------------------------------------- '
Sub iFile_Open()
iFileHan = Freefile
Open iFileName For Binary As #iFileHan
End Sub
Sub iFile_Close()
Close #iFileHan
End Sub
Sub ReadChar()
Dim t As uByte
If EOF(iFileHan) Then
Look = -1
Else
Get #iFileHan, , t
Look = t
End If
End Sub
' ---------------------------------------------------------------------------- '
' Output
' ---------------------------------------------------------------------------- '
Sub oFile_Open()
oFileHan = FREEFILE
Open oFileName For Output As #oFileHan
End Sub
Sub oFile_Close()
Close #oFileHan
End Sub
Sub Emit(s As String)
Print #oFileHan, s;
End Sub
Sub EmitLn(s As String)
Print #oFileHan, s
End Sub
' ---------------------------------------------------------------------------- '
' Recognition and skipping
' ---------------------------------------------------------------------------- '
Function IsDash(c As Integer) As Integer
IsDash = (c = Asc("-"))
End Function
Function IsWhite(c As Integer) As Integer
IsWhite = (c = Asc(" "))
End Function
Function IsWhiteEx(c As Integer) As Integer
IsWhiteEx = ((c = 9) OR (c = 10) OR (c = 13) OR IsWhite(c))
End Function
Sub SkipDash()
While IsDash(Look)
ReadChar
Wend
End Sub
Sub SkipWhiteEx()
While IsWhiteEx(Look)
ReadChar
Wend
End Sub
' ---------------------------------------------------------------------------- '
' HTML Parsing
' ---------------------------------------------------------------------------- '
Sub Parse_Tag_Regular_Type()
' Read a tag type ie body
Dim tag_r_t As String
While (Look <> -1) AND (Look <> Asc(">")) AND (NOT IsWhiteEx(Look))
tag_r_t = tag_r_t & Chr(Look)
ReadChar
Wend
SkipWhiteEx
last_tagtype = tag_r_t
EmitLn("----------------------------------------------------------------")
EmitLn("Tag Type : " & tag_r_t)
End Sub
Sub Parse_Tag_Regular_Attribute_Name()
' Read an attribute name ie bgcolor
Dim tag_r_a_n As String
While (Look <> -1) AND (Look <> Asc(">")) AND (NOT IsWhiteEx(Look)) AND (Look <> Asc("="))
tag_r_a_n = tag_r_a_n & Chr(Look)
ReadChar
Wend
SkipWhiteEx
EmitLn("Tag Attr Name : " & tag_r_a_n)
End Sub
Sub Parse_Tag_Regular_Attribute_Value_DblQuote()
' Read an attribute value with normal quotes ie "#FFFFFF"
Dim tag_r_a_v_d As String
ReadChar ' Dump leading '"'
While (Look <> -1) AND (Look <> Asc(">")) AND (Look <> Asc(""""))
tag_r_a_v_d = tag_r_a_v_d & Chr(Look)
ReadChar
Wend
ReadChar ' Dump trailing '"'
tag_r_a_v_d = """" & tag_r_a_v_d & """"
EmitLn("Tag Attr Val : " & tag_r_a_v_d)
End Sub
Sub Parse_Tag_Regular_Attribute_Value_SngQuote()
' Read an attribute value with apostrophe quotes ie '#FFFFFF'
Dim tag_r_a_v_s As String
ReadChar ' Dump leading '
While (Look <> -1) AND (Look <> Asc(">")) AND (Look <> Asc("'"))
tag_r_a_v_s = tag_r_a_v_s & Chr(Look)
ReadChar
Wend
ReadChar ' Dump trailing '
tag_r_a_v_s = "'" & tag_r_a_v_s & "'"
EmitLn("Tag Attr Val : " & tag_r_a_v_s)
End Sub
Sub Parse_Tag_Regular_Attribute_Value_NoQuote()
' Read an attribute value with no quotes ie #FFFFFF
Dim tag_r_a_v_n As String
While (Look <> -1) AND (Look <> Asc(">")) AND (NOT IsWhiteEx(Look))
tag_r_a_v_n = tag_r_a_v_n & Chr(Look)
ReadChar
Wend
EmitLn("Tag Attr Val : " & tag_r_a_v_n)
End Sub
Sub Parse_Tag_Regular_Attribute_Value()
' We have found '=' in an attribute. We look to see if it is a quoted
' value or not and treat it appropriately
ReadChar ' Dump '='
SkipWhiteEx
Select Case Look
Case Asc("""")
Parse_Tag_Regular_Attribute_Value_DblQuote
Case Asc("'")
Parse_Tag_Regular_Attribute_Value_SngQuote
Case Else
Parse_Tag_Regular_Attribute_Value_NoQuote
End Select
SkipWhiteEx
End Sub
Sub Parse_Tag_Regular_Attributes()
' Read an attribute ie bgcolor="#FFFFFF"
' First we get the attributes name ie 'bgcolor'
' Then we check for presence of '=' to make sure there is an associated
' value,as sometimes there may not be ie nowrap (?)
' If there is a value then we read it
While (Look <> -1) AND (Look <> Asc(">"))
Parse_Tag_Regular_Attribute_Name
Select Case Look
Case Asc(">")
' This must be an attribute with no value ie 'nowrap', and also the
' end of the tag
Case Asc("=")
Parse_Tag_Regular_Attribute_Value
Case Else
' This must be an attribute with no value ie 'nowrap'
End Select
Wend
End Sub
Sub Parse_Tag_Regular()
' We have a regular tag to parse. First we get the type ie 'body', then
' we look for '>'. If '>' is not found that means we have attributes to
' read, ie bgcolor="#FFFFFF"
Parse_Tag_Regular_Type
Select Case Look
Case Asc(">")
' This must be a tag with no attributes ie '<html>'
Case Else
Parse_Tag_Regular_Attributes
End Select
End Sub
Sub Parse_Tag_Comment_Regular
' Read a regular comment such as <!-- Your Comment Here
' We keep the last character so that we can check for the presense of '--'
' which is our terminator.
Dim tag_c_r As String
Dim last As Integer
SkipDash ' Dump leading '--'
While Look <> -1
tag_c_r = tag_c_r & Chr(Look)
last = Look
ReadChar
If IsDash(last) AND IsDash(Look) Then
Exit While
End If
Wend
tag_c_r = Left(tag_c_r, Len(tag_c_r) - 1) ' Trim off the leftover '-'
SkipDash ' Skip '-'
EmitLn("----------------------------------------------------------------")
EmitLn("Comment : " & tag_c_r)
End Sub
Sub Parse_Tag_Comment_DTD
' Read a DTD comment such as <!DOCTYPE HTML...>
Dim tag_c_d As String
While (Look <> -1) AND (Look <> Asc(">"))
tag_c_d = tag_c_d & Chr(Look)
ReadChar
Wend
EmitLn("----------------------------------------------------------------")
EmitLn("DTD : " & tag_c_d)
End Sub
Sub Parse_Tag_Comment()
' We have found '<!' This means we are in a comment, so we check if it
' starts with '-' which would signify a regular comment, otherwise it is
' a DTD such as <!DOCTYPE HTML...>
ReadChar ' Dump leading '!'
Select Case Look
Case Asc("-")
Parse_Tag_Comment_Regular
Case Else
Parse_Tag_Comment_DTD
End Select
End Sub
Sub Parse_Tag()
' We have found '<', now we look for '!'. If '!' is present then it is
' a comment, otherwise it is a normal tag such as 'table'
ReadChar ' Dump leading '<'
SkipWhiteEx
Select Case Look
Case Asc("!")
Parse_Tag_Comment
Case Else
Parse_Tag_Regular
End Select
ReadChar ' Dump trailing '>'
End Sub
Sub Parse_Text()
' We have some text to parse. We strip out any excessive white space, in a
' similar way to browsers.
Dim text As String
If IsWhiteEx(Look) Then
text = text & " "
SkipWhiteEx
End If
While (Look <> -1) AND (Look <> Asc("<"))
text = text & Chr(Look)
ReadChar
If IsWhiteEx(Look) Then
text = text & " "
SkipWhiteEx
End If
Wend
EmitLn("----------------------------------------------------------------")
EmitLn("Text : " & text)
End Sub
Sub Parse_Page_Title()
' We have found <title>, now we do the same as if it were text
Dim p_t As String
If IsWhiteEx(Look) Then
p_t = p_t & " "
SkipWhiteEx
End If
While (Look <> -1) AND (Look <> Asc("<"))
p_t = p_t & Chr(Look)
ReadChar
If IsWhiteEx(Look) Then
p_t = p_t & " "
SkipWhiteEx
End If
Wend
EmitLn("----------------------------------------------------------------")
EmitLn("Page Title : " & p_t)
End Sub
Sub Parse_Style_Script_Comment()
Dim last As Integer
Dim style_s_c As String
ReadChar ' Dump '/'
Select Case Look
Case Asc("*")
ReadChar ' Dump '*'
While Look <> -1
style_s_c = style_s_c & Chr(Look)
last = Look
ReadChar
If (last = Asc("*")) AND (Look = Asc("/")) Then
Exit While
End If
Wend
style_s_c = Left(style_s_c, Len(style_s_c) - 1) ' Trim off the leftover '*'
ReadChar ' Dump '/'
SkipWhiteEx
EmitLn("----------------------------------------------------------------")
EmitLn("Style Script Comment : " & style_s_c)
Case Asc("/")
ReadChar ' Dump '/'
Select Case Look
Case Asc("-")
Exit Sub
Case Else
While (Look <> -1) AND (Look <> 10) AND (Look <> 13)
style_s_c = style_s_c & Chr(Look)
ReadChar
Wend
SkipWhiteEx
EmitLn("----------------------------------------------------------------")
EmitLn("Style Script Comment : " & style_s_c)
End Select
Case Else
Print "UNKNOWN"
End Select
End Sub
Sub Parse_Style_Script_Function()
Dim style_s_f As String
While (Look <> -1) AND (Look <> Asc(";"))
style_s_f = style_s_f & Chr(Look)
ReadChar
Wend
style_s_f = style_s_f & Chr(Look) ' Append ';'
ReadChar ' Dump ';'
SkipWhiteEx
EmitLn("----------------------------------------------------------------")
EmitLn("Style Script Function : " & style_s_f)
End Sub
Sub Parse_Style_Script_Class_List()
Dim style_s_c_l As String
While (Look <> -1) AND (Look <> Asc("{"))
style_s_c_l = style_s_c_l & Chr(Look)
ReadChar
Wend
SkipWhiteEx
EmitLn("----------------------------------------------------------------")
EmitLn("Style Script Classes : " & style_s_c_l)
End Sub
Sub Parse_Style_Script_Class_Attributes()
Dim style_s_c_a As String
ReadChar ' Dump Leading '{'
SkipWhiteEx
While (Look <> -1) AND (Look <> Asc("}"))
style_s_c_a = style_s_c_a & Chr(Look)
ReadChar
Wend
ReadChar ' Dump Trailing '}'
SkipWhiteEx
EmitLn("Style Script Class Attributes : " & style_s_c_a)
End Sub
Sub Parse_Style_Script_Commented()
ReadChar ' Dump '<'
ReadChar ' Dump '!'
SkipDash ' Skip '-'
SkipWhiteEx
While Look <> -1
Select Case Look
Case Asc("-")
SkipDash
ReadChar ' Dump '>'
Exit While
Case Asc("/")
Parse_Style_Script_Comment
Case Asc("@")
Parse_Style_Script_Function
Case Else
Parse_Style_Script_Class_List
Parse_Style_Script_Class_Attributes
End Select
Wend
SkipWhiteEx
End Sub
Sub Parse_Style_Script_NoComment()
' We have a non-commented style script. we look for '<' to end, which is
' not ideal
Dim style_s_n As String
While (Look <> -1) AND (Look <> Asc("<"))
style_s_n = style_s_n & Chr(Look)
ReadChar
Wend
EmitLn("----------------------------------------------------------------")
EmitLn("Style Script : " & style_s_n)
End Sub
Sub Parse_Style_Script()
' Read a script inside of <style>...</style>
SkipWhiteEx
Select Case Look
Case Asc("<")
Parse_Style_Script_Commented
Case Else
Parse_Style_Script_NoComment
End Select
End Sub
Sub Parse_Script_Commented()
' We have a commented script. we look for '--' to end, as it is
' the same as a <!-- Comment
Dim script_c As String
Dim last As Integer
ReadChar ' Dump '<'
ReadChar ' Dump '!'
SkipDash ' Skip '-'
While Look <> -1
script_c = script_c & Chr(Look)
last = Look
ReadChar
If IsDash(last) AND IsDash(Look) Then
Exit While
End If
Wend
script_c = Left(script_c, Len(script_c) - 1) ' Trim off the leftover '-'
SkipDash ' Skip '-'
ReadChar ' Dump '>'
SkipWhiteEx
EmitLn("----------------------------------------------------------------")
EmitLn("Script : " & script_c)
End Sub
Sub Parse_Script_NoComment()
' We have a non-commented script. we look for '<' to end, which is
' not ideal
Dim script_n As String
While (Look <> -1) AND (Look <> Asc("<"))
script_n = script_n & Chr(Look)
ReadChar
Wend
EmitLn("----------------------------------------------------------------")
EmitLn("Script : " & script_n)
End Sub
Sub Parse_Script()
' Read a script inside of <script>...</script>
SkipWhiteEx
Select Case Look
Case Asc("<")
Parse_Script_Commented
Case Else
Parse_Script_NoComment
End Select
End Sub
Sub Parse()
' The first thing we look for is '<' , if this is found then we are parsing
' a tag, otherwise it is regular text.
ReadChar
While Look <> -1
Select Case Look
Case Asc("<")
last_tagtype = ""
Parse_Tag
Select Case UCase(last_tagtype)
Case "STYLE"
Parse_Style_Script
Case "TITLE"
Parse_Page_Title
Case "SCRIPT"
Parse_Script
Case Else
' No special treatment for this tag (yet!)
End Select
Case Else
Parse_Text
End Select
Wend
End Sub
' ---------------------------------------------------------------------------- '
' Main
' ---------------------------------------------------------------------------- '
iFile_Open
oFile_Open
Parse
iFile_Close
oFile_Close