Solution to Concurrent Crawling by Йордан Пулов

Results

  • 7 points from the tests
  • 0 bonus points
  • 7 points total
  • 8 passing test(s)
  • 3 failing test(s)

Code

package main

import (
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"time"
)

func SeekAndDestroy(callback func(string) bool, chunkedUrlsToCheck <-chan []string, workersCount int) (string, error) {
	// wrong number of workers
	if workersCount <= 0 {
		return "", errors.New("Workers are less than 1")
	}

	var urlsQueue []string
	currentUrl := make(chan string)
	gotIt := make(chan string)
	closedChan := make(chan struct{})

	// buffering the channels; cannot do it without another goroutine
	go func() {
		for {
			if len(urlsQueue) > 0 {
				select {
				case currentUrl <- urlsQueue[0]: // reading from the buffer
					urlsQueue = urlsQueue[1:]
				case urls, ok := <-chunkedUrlsToCheck: // writing into the buffer
					if ok == false {
						closedChan <- struct{}{}
						return
					}
					for _, url := range urls {
						urlsQueue = append(urlsQueue, url)
					}
				}
			} else {
				// insertion of the first slice of URLs
				urls, ok := <-chunkedUrlsToCheck
				if ok == false {
					closedChan <- struct{}{}
					return
				}
				for _, url := range urls {
					urlsQueue = append(urlsQueue, url)
				}
			}
		}
	}()

	// spawn all workers at once... not good, I know :(
	for i := 0; i < workersCount; i++ {
		go func() {
			for {
				// get the next URL
				url := <-currentUrl
				resp, urlError := http.Get(url)

				// check for a request error or a non-2xx status code
				if urlError != nil || resp.StatusCode/100 != 2 {
					break
				}

				// read the HTML
				html, err := ioutil.ReadAll(resp.Body)
				resp.Body.Close()

				// problems reading the HTML
				if err != nil {
					break
				}

				// check with the callback
				if callback(fmt.Sprintf("%s", html)) {
					gotIt <- url
					return
				}
			}
		}()
	}

	// waiting for the result
	select {
	case url := <-gotIt:
		// when the parent dies... all children die as well
		return url, nil
	case <-time.After(15 * time.Second):
		return "", errors.New("Time expired")
	case <-closedChan:
		return "", errors.New("Closed Chan")
	}
}
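
For illustration only (this is not part of the submitted solution): a minimal sketch of how SeekAndDestroy might be driven, assuming it is compiled together with the code above in the same main package. The file name, the example.com/example.org URLs, and the search string are all hypothetical.

// usage_sketch.go - illustrative only; assumes the solution above is in the same package.
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical input: one chunk of URLs sent from a separate goroutine.
	// The channel is left open, so the crawler keeps waiting for more chunks
	// until a match is found or the 15-second timeout fires.
	chunks := make(chan []string)
	go func() {
		chunks <- []string{"https://example.com", "https://example.org"}
	}()

	// The callback reports the first page whose body contains "Example Domain".
	url, err := SeekAndDestroy(func(body string) bool {
		return strings.Contains(body, "Example Domain")
	}, chunks, 3)

	if err != nil {
		fmt.Println("no match:", err)
		return
	}
	fmt.Println("found at:", url)
}

The sketch leaves chunks open on purpose: with this implementation, closing the channel while URLs are still queued can trigger the closedChan case and return the "Closed Chan" error before every queued URL has been visited.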

Execution log

History (1 version and 1 comment)

Йордан updated the solution on 10.12.2014 22:48 (more than 3 years ago)
