public class TemplatedHtmlDocData extends DocData
Constructor and Description |
---|
TemplatedHtmlDocData(java.lang.String urlString)
Create a new TemplatedHtmlDocData object from the provided URL string.
|
TemplatedHtmlDocData(java.net.URL url)
Create a new TemplatedHtmlDocData object from the provided URL.
|
Modifier and Type | Method and Description |
---|---|
java.lang.String |
getBody()
Gets the body.
|
protected java.lang.String |
getBodyClass()
Get the body class for this HtmlDocData
|
protected java.lang.String |
getBodyTag()
Get the body tag for this HtmlDocData
|
java.util.HashSet<java.net.URL> |
getLinks()
Get all the links contained in this HTML document
|
java.util.List<java.lang.String> |
getSections()
Return the sections of the document in order.
|
java.lang.String |
getSectionTag()
Get the section tag
|
java.util.Set<java.text.SimpleDateFormat> |
getTimeStampFormats()
Get the set of timestamp formats.
|
java.lang.String |
getTimeStampTag()
Get the timestamp tag
|
protected java.lang.String |
getTitleTag()
Get the title tag for this HtmlDocData
|
java.net.URL |
getURL()
Get the URL for this particular HtmlDocData object.
|
boolean |
loadContent()
Load the content.
|
void |
setBodyTag(java.lang.String tag)
Set the body tag.
|
void |
setBodyTag(java.lang.String tag,
java.lang.String tagClass)
Set the body tag and class.
|
void |
setSectionTag(java.lang.String tag)
Set the section tag.
|
void |
setTimeStampTag(java.lang.String metaTag,
java.lang.Iterable<java.text.SimpleDateFormat> formats)
Set the tag used to identify the time stamp in an HTML document, and a
set of formats used to parse the tag.
|
void |
setTimeStampTag(java.lang.String metaTag,
java.lang.String format)
Set the tag used to identify the time stamp in an HTML document, and the
format used to parse the tag.
|
void |
setTimeStampTag(java.lang.String metaTag,
java.lang.String[] formats)
Set the tag used to identify the time stamp in an HTML document, and a
set of formats used to parse the tag.
|
void |
setTitleTag(java.lang.String tag)
Set the title tag.
|
TemplatedHtmlDocData |
templateDocData(java.lang.String urlString)
Create a new TemplatedHtmlDocData object from the provided url, using
this TemplatedHtmlDocData object as a template.
|
TemplatedHtmlDocData |
templateDocData(java.net.URL url)
Create a new TemplatedHtmlDocData object from the provided URL, using this
TemplatedHtmlDocData object as a template.
|
addKey, addKey, addKeyMap, calculateSimilarity, clearKeys, equals, getKeyMap, getKeys, getLink, getName, getSummary, getTimeStamp, getValueForKey, hashCode, matchesKey, setBody, setLink, setName, setSummary, setTimeStamp, toString
public TemplatedHtmlDocData(java.lang.String urlString) throws java.net.MalformedURLException
urlString
- the URL string to the document
java.net.MalformedURLException
- if the URL is not a valid URL
public TemplatedHtmlDocData(java.net.URL url)
url
- the URL for the document
public java.lang.String getBody()
protected java.lang.String getBodyClass()
protected java.lang.String getBodyTag()
public java.util.HashSet<java.net.URL> getLinks()
public java.util.List<java.lang.String> getSections()
getSections
in class DocData
public java.lang.String getSectionTag()
public java.util.Set<java.text.SimpleDateFormat> getTimeStampFormats()
public java.lang.String getTimeStampTag()
protected java.lang.String getTitleTag()
public java.net.URL getURL()
public boolean loadContent()
loadContent
in class DocData
public void setBodyTag(java.lang.String tag)
tag
- the tag defining the content of the document
public void setBodyTag(java.lang.String tag, java.lang.String tagClass)
tag
- the tag defining the content of the document
tagClass
- the class name for the content
public void setSectionTag(java.lang.String tag)
tag
- the section tag.
public void setTimeStampTag(java.lang.String metaTag, java.lang.Iterable<java.text.SimpleDateFormat> formats)
metaTag
- The string corresponding to the META tag name.
formats
- a collection of date formats (SimpleDateFormat objects)
public void setTimeStampTag(java.lang.String metaTag, java.lang.String format)
metaTag
- The string corresponding to the META tag name.
format
- a date format string indicating the time format used
public void setTimeStampTag(java.lang.String metaTag, java.lang.String[] formats)
metaTag
- The string corresponding to the META tag name.
formats
- a set of date format strings
public void setTitleTag(java.lang.String tag)
tag
- the title tag.
public TemplatedHtmlDocData templateDocData(java.lang.String urlString) throws java.net.MalformedURLException
templateDocData(new URL(urlString))
urlString
- the URL string to the document
java.net.MalformedURLException
- if the URL is not a valid URL
public TemplatedHtmlDocData templateDocData(java.net.URL url)
url
- the URL for the document