我想从网址中提取美国专利名称
(更新:正如评论所指出的那样,专利标题没有标注为"标题";但是,它在网页上一直显示在"摘要"之上.)在大多数情况下,它在"身体"的第7个子元素中或文件中的第3个"字体"元素,但偶尔会在页面顶部发出"**请参见图像:(校正证书)**"或"(复审证书)"的两种方法在你到达标题之前,通过插入一个额外的"body"子项和三个额外的"font"元素来提取.
但是,标题似乎始终是第一个"字体"元素,属性"大小"具有值"+1".不幸的是,其他元素的大小=" - 1",包括并不总是存在的上述元素,因此必须具体使用该属性和值.我已经搜索但无法弄清楚如何按属性和值获取元素.这是我的代码:
Function Test_UpdateTitle(url As String)
Dim title As String
Dim pageSource As String
Dim xml_obj As XMLHTTP60
Set xml_obj = CreateObject("MSXML2.XMLHTTP")
xml_obj.Open "GET", url, False
xml_obj.send
pageSource = xml_obj.responseText
Set xml_obj = Nothing
Dim html_doc As HTMLDocument
Set html_doc = CreateObject("HTMLFile")
html_doc.body.innerHTML = pageSource
Dim fontElement As IHTMLElement
'Methods 1 and 2 fail in cases of a certificate of correction or reexamination certificate
'Method 1
' Dim body As IHTMLElement
' Set body = html_doc.getElementsByTagName("body").Item(0)
' Set fontElement = body.Children(6)
'Method 2
' Set fontElement = html_doc.getElementsByTagName("font").Item(3)
'Method 3
Dim n As Integer
For n = 3 To html_doc.getElementsByTagName("font").Length - 1
Set fontElement = html_doc.getElementsByTagName("font").Item(n)
If InStr(fontElement.innerText, "Please see") = 0 And _
InStr(fontElement.innerText, "( Certificate of Correction )") = 0 And _
InStr(fontElement.innerText, "( Reexamination Certificate )") = 0 And _
InStr(fontElement.innerText, " **") = 0 Then
Test_UpdateTitle = fontElement.innerText
Exit Function
End If
Next n
End Function
Run Code Online (Sandbox Code Playgroud)
我应该补充一点,"**"不能跳过最后一个元素<b> **</b>,我得到"**"作为标题,其中有通知请看图像.在这种情况下星号是通配符吗?
你可以试试这个。只要它的第一个字体标签具有大小属性且值为“+1”,就应该可以工作。我只测试了 3 个不同的页面,但它们都返回了正确的结果。
Function Test_UpdateTitle(url)
title = "Title Not Found!"
Set xml_obj = CreateObject("MSXML2.XMLHTTP")
xml_obj.Open "GET", url, False
xml_obj.send
pageSource = xml_obj.responseText
Set xml_obj = Nothing
Set document = CreateObject("HTMLFile")
document.write pageSource
For i = 0 To document.getElementsByTagName("font").length - 1
If document.getElementsByTagName("font")(i).size = "+1" Then
title = document.getElementsByTagName("font")(i).innerText
Exit For
End If
Next
Test_UpdateTitle = title
End Function
MsgBox Test_UpdateTitle("http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=6293874.PN.&OS=PN/6293874&RS=PN/6293874")
MsgBox Test_UpdateTitle("http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&p=1&u=%2Fnetahtml%2FPTO%2Fsearch-bool.html&r=1&f=G&l=50&co1=AND&d=PTXT&s1=fight.TI.&OS=TTL/fight&RS=TTL/fight")
MsgBox Test_UpdateTitle("http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO2&Sect2=HITOFF&u=%2Fnetahtml%2FPTO%2Fsearch-adv.htm&r=14&f=G&l=50&d=PTXT&p=1&S1=search&OS=search&RS=search")
Run Code Online (Sandbox Code Playgroud)