I am making a web crawler. I'm passing the url through a crawler function and parsing it to get all the links in the anchor tag, then I am invoking same crawler function for all those urls using seperate goroutine for every url.
But if if send a request and cancel it before I get the response, all the groutines for that particular request are still running.
Now what I want is that when I cancel the request all the goroutines that got invoked due to that request stops.
Please guide.
Following is my code for the crawler function.
func crawler(c echo.Context, urlRec string, feed chan string, urlList *[]string, wg *sync.WaitGroup) {
defer wg.Done()
URL, _ := url.Parse(urlRec)
response, err := http.Get(urlRec)
if err != nil {
log.Print(err)
return
}
body := response.Body
defer body.Close()
tokenizer := html.NewTokenizer(body)
flag := true
for flag {
tokenType := tokenizer.Next()
switch {
case tokenType == html.ErrorToken:
flag = false
break
case tokenType == html.StartTagToken:
token := tokenizer.Token()
// Check if the token is an <a> tag
isAnchor := token.Data == "a"
if !isAnchor {
continue
}
ok, urlHref := getReference(token)
if !ok {
continue
}
// Make sure the url begines in http**
hasProto := strings.Index(urlHref, "http") == 0
if hasProto {
if !urlInURLList(urlHref, urlList) {
if strings.Contains(urlHref, URL.Host) {
*urlList = append(*urlList, urlHref)
// fmt.Println(urlHref)
// c.String(http.StatusOK, urlHref+"\n")Documents
if !checkExt(filepath.Ext(urlHref)) {
wg.Add(1)
go crawler(c, urlHref, feed, urlList, wg)
}
}
}
}
}
}
}
And following is my POST request handler
func scrapePOST(c echo.Context) error {
var urlList []string
urlSession := urlFound{}
var wg sync.WaitGroup
urlParam := c.FormValue("url")
feed := make(chan string, 1000)
wg.Add(1)
go crawler(c, urlParam, feed, &urlList, &wg)
wg.Wait()
var count = 0
for _, url := range urlList {
if filepath.Ext(url) == ".jpg" || filepath.Ext(url) == ".jpeg" || filepath.Ext(url) == ".png" {
urlSession.Images = append(urlSession.Images, url)
} else if filepath.Ext(url) == ".doc" || filepath.Ext(url) == ".docx" || filepath.Ext(url) == ".pdf" || filepath.Ext(url) == ".ppt" {
urlSession.Documents = append(urlSession.Documents, url)
} else {
urlSession.Links = append(urlSession.Links, url)
}
count = count + 1
}
urlSession.Count = count
// jsonResp, _ := json.Marshal(urlSession)
// fmt.Print(urlSession)
return c.JSON(http.StatusOK, urlSession)
}