A simple concurrent web crawler written in Go
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

crawler.go 2.1KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. package main
  2. import (
  3. "flag"
  4. "fmt"
  5. "io/ioutil"
  6. "net/http"
  7. "sync"
  8. "time"
  9. )
  10. func request(tasks <-chan int, results chan<- int, worker int, wg *sync.WaitGroup, url *string) {
  11. //The for loop is important as the worker will continue to work.
  12. for task := range tasks {
  13. res, err := http.DefaultClient.Get(fmt.Sprintf("%s/%v", *url, task))
  14. if err != nil {
  15. fmt.Printf("error occured : %v \n", err)
  16. }
  17. defer res.Body.Close()
  18. body, err := ioutil.ReadAll(res.Body)
  19. if err != nil {
  20. fmt.Printf("unable to parse %v : %v \n", url, res)
  21. }
  22. //Write the result to the results channel.
  23. results <- len(body)
  24. }
  25. //Decrement the semaphore value.
  26. wg.Done()
  27. }
  28. func main() {
  29. url := flag.String("url", "http://localhost/tests", "The URL that should be crawled")
  30. minID := flag.Int("min-id", 0, "The minimum ID value")
  31. maxID := flag.Int("max-id", 20, "The maximum ID value")
  32. concurrency := flag.Int("concurrency", 10, "The number of concurrent tasks")
  33. flag.Parse()
  34. startTime := time.Now()
  35. //Create the channels.
  36. tasks := make(chan int)
  37. results := make(chan int)
  38. //Create a waiting group.
  39. var wg sync.WaitGroup
  40. //Set the semaphore value to the concurrency level.
  41. wg.Add(*concurrency)
  42. //Create new workers. This must be done before writing tasks to the channel.
  43. //Otherwise, the channel will block the current main goroutine and deadlock.
  44. for worker := 0; worker < *concurrency; worker++ {
  45. go request(tasks, results, worker, &wg, url)
  46. }
  47. //Create new tasks and write them to the channel.
  48. for task := *minID; task < *maxID; task++ {
  49. tasks <- task
  50. fmt.Printf("The number of bytes fetched is %v \n", <-results)
  51. }
  52. //Close the channel. Close should be called after each task is assigned to the channel.
  53. //Otherwise it will block.
  54. close(tasks)
  55. //It does not block when the results channel is not closed, but, for safety, I close
  56. //the read-only channel as well.
  57. close(results)
  58. //Wait until the semaphore will become zero. This should be called after the channel is closed.
  59. wg.Wait()
  60. endTime := time.Now()
  61. fmt.Printf("time difference is %v \n", endTime.Sub(startTime))
  62. }