如何使用Excel VBA查找/提取属性大小="+ 1"的HTML"font"元素

Pat*_*iee 5 html excel vba

我想从网址中提取美国专利名称

http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=6293874.PN.&OS=PN/6293874&RS = PN/6293874

(更新:正如评论所指出的那样,专利标题没有标注为"标题";但是,它在网页上一直显示在"摘要"之上.)在大多数情况下,它在"身体"的第7个子元素中或文件中的第3个"字体"元素,但偶尔会在页面顶部发出"**请参见图像:(校正证书)**"或"(复审证书)"的两种方法在你到达标题之前,通过插入一个额外的"body"子项和三个额外的"font"元素来提取.

但是,标题似乎始终是第一个"字体"元素,属性"大小"具有值"+1".不幸的是,其他元素的大小=" - 1",包括并不总是存在的上述元素,因此必须具体使用该属性和值.我已经搜索但无法弄清楚如何按属性和值获取元素.这是我的代码:

Function Test_UpdateTitle(url As String)
    Dim title As String
    Dim pageSource As String
    Dim xml_obj As XMLHTTP60
    Set xml_obj = CreateObject("MSXML2.XMLHTTP")
    xml_obj.Open "GET", url, False
    xml_obj.send
    pageSource = xml_obj.responseText
    Set xml_obj = Nothing

    Dim html_doc As HTMLDocument
    Set html_doc = CreateObject("HTMLFile")
    html_doc.body.innerHTML = pageSource
    Dim fontElement As IHTMLElement

'Methods 1 and 2 fail in cases of a certificate of correction or reexamination certificate

'Method 1

'    Dim body As IHTMLElement
'    Set body = html_doc.getElementsByTagName("body").Item(0)
'    Set fontElement = body.Children(6)

'Method 2
'    Set fontElement = html_doc.getElementsByTagName("font").Item(3)

'Method 3

    Dim n As Integer
    For n = 3 To html_doc.getElementsByTagName("font").Length - 1
        Set fontElement = html_doc.getElementsByTagName("font").Item(n)
        If InStr(fontElement.innerText, "Please see") = 0 And _
        InStr(fontElement.innerText, "( Certificate of Correction )") = 0 And _
        InStr(fontElement.innerText, "( Reexamination Certificate )") = 0 And _
        InStr(fontElement.innerText, " **") = 0 Then
            Test_UpdateTitle = fontElement.innerText
            Exit Function
        End If
    Next n

End Function
Run Code Online (Sandbox Code Playgroud)

我应该补充一点,"**"不能跳过最后一个元素<b> **</b>,我得到"**"作为标题,其中有通知请看图像.在这种情况下星号是通配符吗?

Sea*_*ell 1

你可以试试这个。只要它的第一个字体标签具有大小属性​​且值为“+1”,就应该可以工作。我只测试了 3 个不同的页面,但它们都返回了正确的结果。

Function Test_UpdateTitle(url)
    title = "Title Not Found!"
    Set xml_obj = CreateObject("MSXML2.XMLHTTP")
    xml_obj.Open "GET", url, False
    xml_obj.send
    pageSource = xml_obj.responseText
    Set xml_obj = Nothing

    Set document = CreateObject("HTMLFile")
    document.write pageSource   

    For i = 0 To document.getElementsByTagName("font").length - 1
        If document.getElementsByTagName("font")(i).size = "+1" Then
            title = document.getElementsByTagName("font")(i).innerText
            Exit For
        End If
    Next

    Test_UpdateTitle = title

End Function

MsgBox Test_UpdateTitle("http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=6293874.PN.&OS=PN/6293874&RS=PN/6293874")
MsgBox Test_UpdateTitle("http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&co1=AND&d=PTXT&s1=fight.TI.&OS=TTL/fight&RS=TTL/fight")
MsgBox Test_UpdateTitle("http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=14&f=G&l=50&d=PTXT&p=1&S1=search&OS=search&RS=search")
Run Code Online (Sandbox Code Playgroud)