[GO] Scraping(2) - Echo server

A simple sample that puts an echo web server in front of the scraper. It wraps up the scraping series.

 

Installing the echo package

 

D:\workspace\GO\nomad>go get github.com/labstack/echo
go: downloading github.com/labstack/echo v1.4.4
go: downloading github.com/labstack/echo v3.3.10+incompatible
go: downloading github.com/labstack/gommon v0.3.0
go: downloading golang.org/x/crypto v0.0.0-20210921155107-089bfa567519
go: downloading github.com/mattn/go-colorable v0.1.2
go: downloading github.com/mattn/go-isatty v0.0.9     
go: downloading github.com/valyala/fasttemplate v1.0.1
go: downloading golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1
go: downloading github.com/valyala/bytebufferpool v1.0.0
go: downloading golang.org/x/text v0.3.6
go get: added github.com/labstack/echo v3.3.10+incompatible
go get: added github.com/labstack/gommon v0.3.0
go get: added golang.org/x/crypto v0.0.0-20210921155107-089bfa567519
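
Note: the "+incompatible" suffix in the output means echo v3.x was tagged before the library adopted Go modules, so the go tool falls back to the latest tag it can find. The code below imports "github.com/labstack/echo" to match that version. If you prefer the module-aware v4 release instead (a hedged aside; the handlers in this post work the same either way), only the install command and import path change:

go get github.com/labstack/echo/v4

import (
	"github.com/labstack/echo/v4"
)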

 

main.go

 

package main

import (
	"fmt"
	"os"
	"strings"

	"GO/nomad/scrapper"

	"github.com/labstack/echo"
)

// fileName is the CSV file the scraper produces and the handler serves.
const fileName string = "jobs.csv"

// handleHome serves the search form.
func handleHome(c echo.Context) error {
	return c.File("home.html")
}

// handleScrape runs the scraper for the submitted term and returns the
// resulting CSV as a download. The deferred os.Remove runs after
// c.Attachment has finished streaming the file, so old results do not
// pile up on the server.
func handleScrape(c echo.Context) error {
	defer os.Remove(fileName)
	fmt.Println("term:", c.FormValue("term"))
	term := strings.ToLower(scrapper.CleanString(c.FormValue("term")))
	scrapper.Scrape(term)
	return c.Attachment(fileName, fileName)
}

func main() {
	e := echo.New()
	e.GET("/", handleHome)
	// The path needs a leading slash to match the form's action="/scrape".
	e.POST("/scrape", handleScrape)
	e.Logger.Fatal(e.Start(":1323"))
}
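
One rough edge: handleScrape runs the scraper even when the form field is empty. A minimal guard might look like this (a hypothetical addition, not in the original code; it assumes "net/http" is added to the imports):

func handleScrape(c echo.Context) error {
	term := strings.ToLower(scrapper.CleanString(c.FormValue("term")))
	if term == "" {
		// Reject empty searches before doing any network work.
		return c.String(http.StatusBadRequest, "term is required")
	}
	defer os.Remove(fileName)
	scrapper.Scrape(term)
	return c.Attachment(fileName, fileName)
}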

 

home.html - submits the form data to "/scrape" via POST; on the server side this invokes the handleScrape function.

 

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Go Jobs</title>
</head>
<body>
    <h1>Go Jobs</h1>
    <h3>Indeed.com scrapper</h3>
    <form method="POST" action="/scrape">
        <input placeholder="what job do you want" name="term">
        <button>Search</button>
    </form>
</body>
</html>
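
The endpoint can also be exercised without a browser. Below is a minimal smoke-test sketch using only the standard library (it assumes the server above is already running on :1323; the output filename is arbitrary):

package main

import (
	"io"
	"log"
	"net/http"
	"net/url"
	"os"
)

func main() {
	// POST the same "term" field the HTML form sends.
	res, err := http.PostForm("http://localhost:1323/scrape",
		url.Values{"term": {"golang"}})
	if err != nil {
		log.Fatalln(err)
	}
	defer res.Body.Close()

	// Save the CSV attachment returned by handleScrape.
	out, err := os.Create("downloaded_jobs.csv")
	if err != nil {
		log.Fatalln(err)
	}
	defer out.Close()

	if _, err := io.Copy(out, res.Body); err != nil {
		log.Fatalln(err)
	}
}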

 

./scrapper/scrapper.go - crawls the https://kr.indeed.com search pages and extracts the fields we need.

 

package scrapper

import (
	"encoding/csv"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// extractedJob holds the fields scraped from a single job card.
type extractedJob struct {
	id       string
	title    string
	location string
	salary   string
	summary  string
}

// Scrape fetches every result page for the given term concurrently and
// writes the collected jobs to jobs.csv.
func Scrape(term string) {

	// Note: term is inserted into the URL verbatim; url.QueryEscape would
	// be safer for multi-word searches.
	baseURL := "https://kr.indeed.com/jobs?q=" + term + "&limit=50"
	var jobs []extractedJob
	ch := make(chan []extractedJob)
	totalPages := getPages(baseURL)
	fmt.Println(totalPages)

	// Fan out: one goroutine per result page.
	for i := 0; i < totalPages; i++ {
		go getPage(i, baseURL, ch)
	}

	// Fan in: receive one slice of jobs from each page goroutine.
	for i := 0; i < totalPages; i++ {
		job := <-ch
		jobs = append(jobs, job...)
	}

	writeJobs(jobs)
	fmt.Println("Done, extracted", len(jobs))
}

// getPage requests one result page, spawns a goroutine per job card on it,
// and sends the page's jobs back to Scrape on mainCh.
func getPage(page int, url string, mainCh chan<- []extractedJob) {

	var jobs []extractedJob
	ch := make(chan extractedJob)
	// Each page lists 50 results, so the offset is page*50.
	pageURL := url + "&start=" + strconv.Itoa(page*50)
	fmt.Println("requesting:", pageURL)
	res, err := http.Get(pageURL)
	checkErr(err)
	checkCode(res)

	defer res.Body.Close()
	doc, err := goquery.NewDocumentFromReader(res.Body)
	checkErr(err)

	searchCards := doc.Find(".tapItem")
	searchCards.Each(func(i int, card *goquery.Selection) {
		go extractJob(card, ch)
	})

	for i := 0; i < searchCards.Length(); i++ {
		job := <-ch
		jobs = append(jobs, job)
	}

	mainCh <- jobs

}

// writeJobs saves the collected jobs to jobs.csv.
func writeJobs(jobs []extractedJob) {
	file, err := os.Create("jobs.csv")
	checkErr(err)
	// Deferred calls run last-in-first-out, so w.Flush() below runs
	// before file.Close().
	defer file.Close()
	// Write a UTF-8 BOM first so Excel renders the Korean text correctly.
	utf8bom := []byte{0xEF, 0xBB, 0xBF}
	file.Write(utf8bom)

	w := csv.NewWriter(file)
	defer w.Flush()

	headers := []string{"ID", "Title", "Location", "Salary", "Summary"}

	wErr := w.Write(headers)
	checkErr(wErr)

	for _, job := range jobs {
		jobSlice := []string{
			"https://kr.indeed.com/viewjob?jk=" + job.id,
			job.title,
			job.location,
			job.salary,
			job.summary}
		jwErr := w.Write(jobSlice)
		checkErr(jwErr)

	}

}

// extractJob pulls the fields out of one job card and sends the result on ch.
func extractJob(card *goquery.Selection, ch chan<- extractedJob) {
	id, _ := card.Attr("data-jk")
	title := CleanString(card.Find(".jobTitle>span").Text())
	location := CleanString(card.Find(".companyLocation").Text())
	salary := CleanString(card.Find(".salary-snippet").Text())
	summary := CleanString(card.Find(".job-snippet").Text())

	ch <- extractedJob{
		id:       id,
		title:    title,
		location: location,
		salary:   salary,
		summary:  summary,
	}

}

// CleanString collapses every run of whitespace (spaces, tabs, newlines)
// into a single space, e.g. "  Senior \n Go   Dev " -> "Senior Go Dev".
func CleanString(str string) string {
	return strings.Join(strings.Fields(strings.TrimSpace(str)), " ")
}

// getPages counts the pagination links on the first result page to decide
// how many pages to fetch.
func getPages(url string) int {
	pages := 0
	res, err := http.Get(url)
	checkErr(err)
	checkCode(res)

	defer res.Body.Close()
	doc, err := goquery.NewDocumentFromReader(res.Body)
	checkErr(err)

	doc.Find(".pagination").Each(func(i int, s *goquery.Selection) {
		pages = s.Find("a").Length()
	})

	return pages
}

// checkErr aborts the whole process on any error. Note that log.Fatalln
// inside a scraping goroutine takes the echo server down with it.
func checkErr(err error) {
	if err != nil {
		log.Fatalln(err)
	}
}

// checkCode aborts unless the response status is 200 OK.
func checkCode(res *http.Response) {
	if res.StatusCode != 200 {
		log.Fatalln("Request failed with Status:", res.StatusCode, res.Status)
	}
}
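
Because checkErr and checkCode call log.Fatalln, a single failed request kills the whole web server. That is fine for a tutorial, but a long-running service would return errors instead. A rough sketch of what getPages could look like (a hypothetical refactor inside the scrapper package, not part of the original code):

// getPagesSafe reports failures to the caller instead of exiting the process.
func getPagesSafe(url string) (int, error) {
	res, err := http.Get(url)
	if err != nil {
		return 0, err
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("request failed with status: %s", res.Status)
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return 0, err
	}

	pages := 0
	doc.Find(".pagination").Each(func(i int, s *goquery.Selection) {
		pages = s.Find("a").Length()
	})
	return pages, nil
}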

 

Run the server and open http://localhost:1323/ in a browser; the search form from home.html appears. Enter a search term and click the Search button.

 

The search results are saved to the browser's download folder automatically (or manually, depending on the browser's settings).
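
Based on writeJobs, the downloaded jobs.csv begins with a UTF-8 BOM followed by the header row, and each data row links back to the original posting. Schematically (the field values here are placeholders):

ID,Title,Location,Salary,Summary
https://kr.indeed.com/viewjob?jk=<id>,<title>,<location>,<salary>,<summary>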

 

Reference [https://youtu.be/MRrx8Lk1wOI]
