Documentation
¶
Index ¶
- func AskForString(prompt string) string
- func ExtractTable(pageSource *html.Node, tableRowsExpression string) ([]*html.Node, error)
- func ExtractText(node *html.Node, nodeExpression string, Dirt string) (string, error)
- func FindNodes(node *html.Node, nodeExpression string) ([]*html.Node, error)
- func GetElementAttributeFromNode(node *html.Node, xpathExpr, attribute string) (string, error)
- func ParseHtmlToString(pageSource *html.Node) (string, error)
- func ParseStringToHtmlNode(pageSource string) (*html.Node, error)
- type Navigator
- func (nav *Navigator) CaptureScreenshot(nameFile string) error
- func (nav *Navigator) CheckPageTitle(url string) (bool, error)
- func (nav *Navigator) CheckRadioButton(selector string) error
- func (nav *Navigator) ClickButton(selector string) error
- func (nav *Navigator) ClickElement(selector string) error
- func (nav *Navigator) Close()
- func (nav *Navigator) Datepicker(...) error
- func (nav *Navigator) EvaluateScript(script string) (interface{}, error)
- func (nav *Navigator) ExecuteScript(script string) error
- func (nav *Navigator) ExtractLinks() ([]string, error)
- func (nav *Navigator) FillField(selector string, value string) error
- func (nav *Navigator) FillForm(selector string, data map[string]string) error
- func (nav *Navigator) GetCurrentURL() (string, error)
- func (nav *Navigator) GetElement(selector string) (string, error)
- func (nav *Navigator) GetElementAttribute(selector, attribute string) (string, error)
- func (nav *Navigator) GetPageSource() (*html.Node, error)
- func (nav *Navigator) HandleAlert() error
- func (nav *Navigator) Login(url, username, password, usernameSelector, passwordSelector, ... string, ...) error
- func (nav *Navigator) LoginAccountsGoogle(email, password string) error
- func (nav *Navigator) LoginWithGoogle(url string) error
- func (nav *Navigator) MakeCaptchaElementVisible(selector string) error
- func (nav *Navigator) MakeElementVisible(selector string) error
- func (nav *Navigator) OpenURL(url string) error
- func (nav *Navigator) ReloadPage(retryCount int) error
- func (nav *Navigator) SaveImageBase64(selector, outputPath, prefixClean string) (string, error)
- func (nav *Navigator) SelectDropdown(selector, value string) error
- func (nav *Navigator) SetQueryType(queryType chromedp.QueryOption)
- func (nav *Navigator) SetTimeOut(timeOut time.Duration)
- func (nav *Navigator) SwitchToDefaultContent() error
- func (nav *Navigator) SwitchToFrame(selector string) error
- func (nav *Navigator) SwitchToNewTab() (*Navigator, error)
- func (nav *Navigator) UncheckRadioButton(selector string) error
- func (nav *Navigator) UnsafeClickButton(selector string) error
- func (nav *Navigator) UnsafeFillField(selector string, value string) error
- func (nav *Navigator) UseCSS()
- func (nav *Navigator) UseXPath()
- func (nav *Navigator) WaitForElement(selector string, timeout time.Duration) error
- func (nav *Navigator) WaitPageLoad() (string, error)
- type PageSource
- func EvaluateParallelRequests(previousResults []PageSource, crawlerFunc func(string) (*html.Node, error), ...) ([]PageSource, error)
- func ParallelRequests(requests []Request, numberOfWorkers int, delay time.Duration, ...) ([]PageSource, error)
- func RemovePageSource(slice []PageSource, s int) []PageSource
- type Request
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AskForString ¶ added in v1.4.0
AskForString prompts the user to enter a string and returns the trimmed input.
func ExtractTable ¶ added in v1.2.0
ExtractTable extracts data from a table specified by the selector. Example:
tableData, err := goSpider.ExtractTable(pageSource, "#tableID")
func ExtractText ¶ added in v1.2.0
ExtractText extracts text content from nodes specified by the parent selectors. Example:
textData, err := goSpider.ExtractText(pageSource,"#parent1", "\n")
func FindNodes ¶ added in v1.2.0
FindNodes extracts the nodes matched by the given node expression within the supplied node. Example:
nodeData, err := goSpider.FindNodes(pageSource, "#parent1")
func GetElementAttributeFromNode ¶ added in v1.7.8
GetElementAttributeFromNode retrieves the value of a specified attribute from an element located using an XPath expression within a given HTML node. Parameters: - node: The root HTML node to search within. - xpathExpr: The XPath expression that identifies the target element. - attribute: The attribute name whose value you want to retrieve. Returns: - The attribute value as a string. - An error if the element or attribute cannot be found.
func ParseHtmlToString ¶ added in v1.7.8
ParseHtmlToString converts an *html.Node into a string, for debugging purposes.
Types ¶
type Navigator ¶
type Navigator struct {
}
Navigator is a struct that holds the context for the ChromeDP session and a logger.
func NewNavigator ¶
NewNavigator creates a new Navigator instance.
Parameters:
- profilePath: the path to chrome profile defined by the user; can be passed as an empty string
- headless: if false will show chrome UI
Example:
nav := goSpider.NewNavigator("/Users/USER_NAME/Library/Application Support/Google/Chrome/Profile 2", true, initialCookies)
NewNavigator creates a new Navigator instance with enhanced logging for troubleshooting authentication issues.
func (*Navigator) CaptureScreenshot ¶
CaptureScreenshot captures a screenshot of the current browser window. Example:
err := nav.CaptureScreenshot("img")
func (*Navigator) CheckPageTitle ¶ added in v1.8.0
CheckPageTitle navigates to the provided URL and checks if the page title equals "Ah, não!". It returns true if the error title is detected, otherwise false.
func (*Navigator) CheckRadioButton ¶
CheckRadioButton selects a radio button specified by the selector. Example:
err := nav.CheckRadioButton("#radioButtonID")
func (*Navigator) ClickButton ¶
ClickButton clicks a button specified by the selector. Example:
err := nav.ClickButton("#buttonID")
func (*Navigator) ClickElement ¶
ClickElement clicks an element specified by the selector. Example:
err := nav.ClickElement("#elementID")
func (*Navigator) Close ¶
func (nav *Navigator) Close()
Close closes the Navigator instance and releases resources. Example:
nav.Close()
func (*Navigator) Datepicker ¶ added in v1.7.5
func (nav *Navigator) Datepicker(date, calendarButtonSelector, calendarButtonGoBack, calendarButtonsTableXpath, calendarButtonTR string) error
Datepicker deals with date-picker elements on websites: it receives a date, calculates how far back it needs to go in the picker, and finally selects a day.
date: string in the format "dd/mm/yyyy"; calendarButtonSelector: the css selector of the date-picker; calendarButtonGoBack: the css selector of the go back button; calendarButtonsTableXpath: the xpath of the days table, example: "//*[@id="ui-datepicker-div"]/table/tbody/tr"; calendarButtonTR: the css selector of the days table row, example: "//*[@id="ui-datepicker-div"]/table/tbody/tr"
func (*Navigator) EvaluateScript ¶ added in v1.3.0
EvaluateScript executes a JavaScript script and returns the result
func (*Navigator) ExecuteScript ¶ added in v1.3.0
ExecuteScript runs the specified JavaScript on the current page. script: the JavaScript code to execute. Returns an error, if any.
func (*Navigator) ExtractLinks ¶
ExtractLinks extracts all links from the current page. Example:
links, err := nav.ExtractLinks()
func (*Navigator) FillField ¶
FillField fills a field specified by the selector with the provided value. Example:
err := nav.FillField("#fieldID", "value")
func (*Navigator) FillForm ¶
FillForm fills out a form specified by the selector with the provided data and submits it. Example:
formData := map[string]string{
"username": "myUsername",
"password": "myPassword",
}
err := nav.FillForm("#loginForm", formData)
func (*Navigator) GetCurrentURL ¶
GetCurrentURL returns the current URL of the browser. Example:
currentURL, err := nav.GetCurrentURL()
func (*Navigator) GetElement ¶
GetElement retrieves the text content of an element specified by the selector. Example:
text, err := nav.GetElement("#elementID")
func (*Navigator) GetElementAttribute ¶ added in v1.7.0
GetElementAttribute retrieves the value of a specified attribute from an element identified by a CSS selector. Parameters: - selector: The CSS selector of the element. - attribute: The name of the attribute to retrieve the value of. Returns: - The value of the specified attribute. - An error if the attribute value could not be retrieved.
func (*Navigator) GetPageSource ¶ added in v1.2.0
GetPageSource captures all page HTML from the current page. Returns the page HTML as a parsed *html.Node and an error, if any. Example:
pageSource, err := nav.GetPageSource()
func (*Navigator) HandleAlert ¶
HandleAlert handles JavaScript alerts by accepting them. Example:
err := nav.HandleAlert()
func (*Navigator) Login ¶
func (nav *Navigator) Login(url, username, password, usernameSelector, passwordSelector, loginButtonSelector string, messageFailedSuccess string) error
Login logs into a website using the provided credentials and selectors. Example:
err := nav.Login("https://www.example.com/login", "username", "password", "#username", "#password", "#login-button", "#login-message-fail")
func (*Navigator) LoginAccountsGoogle ¶ added in v1.6.0
LoginAccountsGoogle performs the Google account login using the given email and password
func (*Navigator) LoginWithGoogle ¶ added in v1.4.0
LoginWithGoogle performs the Google login on the given URL
func (*Navigator) MakeCaptchaElementVisible ¶ added in v1.7.8
MakeCaptchaElementVisible changes the style display of an element to nil
func (*Navigator) MakeElementVisible ¶ added in v1.7.3
MakeElementVisible changes the style display of an element to nil
func (*Navigator) OpenURL ¶
OpenURL opens the specified URL in the current browser context. It will retry up to 3 times if the page title indicates an error ("Ah, não!"). Example:
err := nav.OpenURL("https://www.example.com")
func (*Navigator) ReloadPage ¶ added in v1.3.0
ReloadPage reloads the current page with retry logic. retryCount: number of times to retry reloading the page in case of failure. Returns an error, if any.
func (*Navigator) SaveImageBase64 ¶ added in v1.6.0
SaveImageBase64 extracts the base64 image data from the given selector and saves it to a file.
Parameters:
- selector: the CSS selector of the CAPTCHA image element
- outputPath: the file path to save the image
- prefixClean: the prefix to clear from the source, if any
Example:
err := nav.SaveImageBase64("#imagemCaptcha", "captcha.png", "data:image/png;base64,")
func (*Navigator) SelectDropdown ¶
SelectDropdown selects an option in a dropdown specified by the selector and value. Example:
err := nav.SelectDropdown("#dropdownID", "optionValue")
func (*Navigator) SetQueryType ¶ added in v1.8.1
func (nav *Navigator) SetQueryType(queryType chromedp.QueryOption)
SetQueryType defines the selector type (CSS or XPath)
func (*Navigator) SetTimeOut ¶ added in v1.6.4
SetTimeOut sets a timeout for all the waiting functions on the package. The standard timeout of the Navigator is 300 ms.
func (*Navigator) SwitchToDefaultContent ¶ added in v1.7.0
SwitchToDefaultContent switches the context back to the main content from an iframe context.
func (*Navigator) SwitchToFrame ¶ added in v1.7.0
SwitchToFrame switches the context to the specified iframe.
func (*Navigator) SwitchToNewTab ¶ added in v1.8.2
SwitchToNewTab returns the Navigator with a new context
func (*Navigator) UncheckRadioButton ¶
UncheckRadioButton unchecks a checkbox specified by the selector. Example:
err := nav.UncheckRadioButton("#checkboxID")
func (*Navigator) UnsafeClickButton ¶ added in v1.7.4
UnsafeClickButton clicks a button specified by the selector. Unsafe because this method does not use the wait element feature. Example:
err := nav.UnsafeClickButton("#buttonID")
func (*Navigator) UnsafeFillField ¶ added in v1.7.4
UnsafeFillField fills a field specified by the selector with the provided value. Unsafe because this method does not use the wait element feature. Example:
err := nav.UnsafeFillField("#fieldID", "value")
func (*Navigator) WaitForElement ¶
WaitForElement waits for an element specified by the selector to be visible within the given timeout. Example:
err := nav.WaitForElement("#elementID", 5*time.Second)
func (*Navigator) WaitPageLoad ¶ added in v1.3.0
WaitPageLoad waits for the current page to fully load by checking the document.readyState property. It will retry until the page is fully loaded or the timeout of one minute is reached. Returns the page readyState as a string and an error, if any.
type PageSource ¶ added in v1.2.0
PageSource structure to hold the HTML data
func EvaluateParallelRequests ¶ added in v1.3.0
func EvaluateParallelRequests(previousResults []PageSource, crawlerFunc func(string) (*html.Node, error), evaluate func([]PageSource) ([]Request, []PageSource)) ([]PageSource, error)
EvaluateParallelRequests iterates over a set of previous results, evaluates them using the provided evaluation function, and handles re-crawling of problematic sources until all sources are valid or no further progress can be made.
Parameters: - previousResults: A slice of PageSource objects containing the initial crawl results. - crawlerFunc: A function that takes a string (URL or identifier) and returns a parsed HTML node and an error. - evaluate: A function that takes a slice of PageSource objects and returns two slices:
- A slice of Request objects for sources that need to be re-crawled.
- A slice of valid PageSource objects.
Returns: - A slice of valid PageSource objects after all problematic sources have been re-crawled and evaluated. - An error if there is a failure in the crawling process.
Example usage:
results, err := EvaluateParallelRequests(resultsFirst, Crawler, Eval)
func Eval(previousResults []PageSource) ([]Request, []PageSource) {
var newRequests []Request
var validResults []PageSource
for _, result := range previousResults {
_, err := extractDataCover(result.Page, "")
if err != nil {
newRequests = append(newRequests, Request{SearchString: result.Request})
} else {
validResults = append(validResults, result)
}
}
return newRequests, validResults
}
func ParallelRequests ¶ added in v1.1.0
func ParallelRequests(requests []Request, numberOfWorkers int, delay time.Duration, crawlerFunc func(string) (*html.Node, error)) ([]PageSource, error)
ParallelRequests performs web scraping tasks concurrently with a specified number of workers and a delay between requests. The crawlerFunc parameter allows for flexibility in defining the web scraping logic.
Parameters: - requests: A slice of Request structures containing the data needed for each request. - numberOfWorkers: The number of concurrent workers to process the requests. - delay: The delay duration between each request to avoid overwhelming the target server. - crawlerFunc: A user-defined function that takes a process number as input and returns the html as *html.Node, and an error.
Returns: - A slice of PageSource structures containing the results of the web scraping tasks. - An error if any occurred during the requests.
Example Usage:
results, err := ParallelRequests(requests, numberOfWorkers, delay, crawlerFunc)
func RemovePageSource ¶ added in v1.3.0
func RemovePageSource(slice []PageSource, s int) []PageSource
RemovePageSource removes the element at index `s` from a slice of `PageSource` objects. It returns the modified slice without the element at index `s`.
type Request ¶ added in v1.3.0
type Request struct {
SearchString string
}
Request structure to hold user data
func RemoveRequest ¶ added in v1.3.0
RemoveRequest removes the element at index `s` from a slice of `Request` objects. It returns the modified slice without the element at index `s`.