간단하게 echo 서버를 이용한 샘플입니다. Scraping 관련 마무리 샘플입니다.
echo 패키지 설치
D:\workspace\GO\nomad>go get github.com/labstack/echo
go: downloading github.com/labstack/echo v1.4.4
go: downloading github.com/labstack/echo v3.3.10+incompatible
go: downloading github.com/labstack/gommon v0.3.0
go: downloading golang.org/x/crypto v0.0.0-20210921155107-089bfa567519
go: downloading github.com/mattn/go-colorable v0.1.2
go: downloading github.com/mattn/go-isatty v0.0.9
go: downloading github.com/valyala/fasttemplate v1.0.1
go: downloading golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1
go: downloading github.com/valyala/bytebufferpool v1.0.0
go: downloading golang.org/x/text v0.3.6
go get: added github.com/labstack/echo v3.3.10+incompatible
go get: added github.com/labstack/gommon v0.3.0
go get: added golang.org/x/crypto v0.0.0-20210921155107-089bfa567519
main.go
package main
import (
"GO/nomad/scrapper"
"fmt"
"os"
"strings"
"github.com/labstack/echo"
)
// File_Name is the CSV file that handleScrape sends to the client as an
// attachment and removes afterwards. It has to stay in sync with the
// "jobs.csv" name that the scrapper package's writeJobs creates.
const File_Name string = "jobs.csv"
// handleHome answers GET / by sending the home.html file, which contains the
// search form.
func handleHome(c echo.Context) error {
	const page = "home.html"
	return c.File(page)
}
// handleScrape handles the search form submission. It reads the "term" form
// value, normalises its whitespace and case, runs the scraper with it, and
// returns the generated CSV file as an attachment. The CSV file is removed
// when the handler returns.
func handleScrape(c echo.Context) error {
	// Registered up front so the file is cleaned up however the handler exits;
	// the removal error is deliberately ignored (best effort).
	defer os.Remove(File_Name)

	rawTerm := c.FormValue("term")
	fmt.Println("term:", rawTerm)

	cleaned := scrapper.CleanString(rawTerm)
	scrapper.Scrape(strings.ToLower(cleaned))

	return c.Attachment(File_Name, File_Name)
}
// main registers the two routes and starts the echo server on port 1323.
// e.Start blocks; whatever error it returns is logged as fatal.
func main() {
	e := echo.New()
	e.GET("/", handleHome)
	// The leading slash is written out so the route reads the same as the
	// form's action="/scrape" in home.html and as the "/" route above, instead
	// of relying on the router to normalise a relative path.
	e.POST("/scrape", handleScrape)
	e.Logger.Fatal(e.Start(":1323"))
}
home.html - POST 방식으로 "scrape" 페이지로 데이터 전달, 서버단에서는 handleScrape 함수가 호출됨.
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Go Jobs</title>
</head>
<body>
    <h1>Go Jobs</h1>
    <h3>Indeed.com scrapper</h3>
    <!-- Submits the search term as the "term" form field via POST to /scrape. -->
    <form method="POST" action="/scrape">
        <input placeholder="what job do you want" name="term">
        <button>Search</button>
    </form>
</body>
</html>
./scrapper/scrapper.go - https://kr.indeed.com 페이지를 크롤링하여 필요한 데이터를 추출함.
package scrapper
import (
	"encoding/csv"
	"fmt"
	"log"
	"net/http"
	"net/url"
	"os"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)
// extractedJob holds the values pulled out of a single search-result card.
type extractedJob struct {
	// id is the card's "data-jk" attribute; the remaining fields are the
	// whitespace-normalised text of the corresponding card elements.
	id, title, location, salary, summary string
}
func Scrape(term string) {
var baseURL string = "https://kr.indeed.com/jobs?q=" + term + "&limit=50"
var jobs []extractedJob
ch := make(chan []extractedJob)
totalPages := getPages(baseURL)
fmt.Println(totalPages)
for i := 0; i < totalPages; i++ {
go getPage(i, baseURL, ch)
}
for i := 0; i < totalPages; i++ {
job := <-ch
jobs = append(jobs, job...)
}
writeJobs(jobs)
fmt.Println("Done, extraced", len(jobs))
}
// getPage downloads one result page and sends the jobs found on it to mainCh.
//
// page is the zero-based page index; it is turned into the "start" offset in
// steps of 50. url is the base search URL the offset is appended to. Request
// and parse failures are fatal via checkErr/checkCode.
func getPage(page int, url string, mainCh chan<- []extractedJob) {
	pageURL := url + "&start=" + strconv.Itoa(page*50)
	fmt.Println("requesting:", pageURL)

	res, err := http.Get(pageURL)
	checkErr(err)
	checkCode(res)
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	checkErr(err)

	// One goroutine per card; each sends exactly one extractedJob on results.
	cards := doc.Find(".tapItem")
	results := make(chan extractedJob)
	cards.Each(func(_ int, card *goquery.Selection) {
		go extractJob(card, results)
	})

	// Receive as many jobs as there are cards before reporting back.
	var collected []extractedJob
	for remaining := cards.Length(); remaining > 0; remaining-- {
		collected = append(collected, <-results)
	}
	mainCh <- collected
}
// writeJobs writes jobs to "jobs.csv" in the current working directory: a
// UTF-8 byte order mark, a header row, then one row per job. The first column
// holds the job's viewjob URL built from its id. Any failure is fatal via
// checkErr.
func writeJobs(jobs []extractedJob) {
	file, err := os.Create("jobs.csv")
	checkErr(err)
	// Close the file so its handle is not leaked on every call; a failing
	// Close is reported because it can mean data was not written out.
	defer func() { checkErr(file.Close()) }()

	// UTF-8 byte order mark at the start of the file — presumably so that
	// spreadsheet tools pick the right encoding for non-ASCII text.
	_, err = file.Write([]byte{0xEF, 0xBB, 0xBF})
	checkErr(err)

	w := csv.NewWriter(file)
	checkErr(w.Write([]string{"ID", "Title", "Location", "Salary", "Summary"}))
	for _, job := range jobs {
		checkErr(w.Write([]string{
			"https://kr.indeed.com/viewjob?jk=" + job.id,
			job.title,
			job.location,
			job.salary,
			job.summary,
		}))
	}

	// Flush before the deferred Close runs, and surface any error the
	// buffered writer hit while writing to the file.
	w.Flush()
	checkErr(w.Error())
}
// extractJob reads the fields of one search-result card and sends them on ch
// as a single extractedJob.
func extractJob(card *goquery.Selection, ch chan<- extractedJob) {
	// textOf returns the whitespace-normalised text of the elements inside
	// the card that match selector.
	textOf := func(selector string) string {
		return CleanString(card.Find(selector).Text())
	}

	var job extractedJob
	job.id, _ = card.Attr("data-jk") // the "exists" flag is not needed here
	job.title = textOf(".jobTitle>span")
	job.location = textOf(".companyLocation")
	job.salary = textOf(".salary-snippet")
	job.summary = textOf(".job-snippet")

	ch <- job
}
// CleanString drops leading and trailing whitespace from str and collapses
// every inner run of whitespace into a single space.
func CleanString(str string) string {
	// Fields splits on runs of whitespace and never yields empty entries,
	// so leading/trailing whitespace disappears as part of the split.
	words := strings.Fields(str)
	return strings.Join(words, " ")
}
// getPages fetches url and returns the number of <a> elements found inside
// the page's ".pagination" element, or 0 when there is no such element.
// Request and parse failures are fatal via checkErr/checkCode.
func getPages(url string) int {
	res, err := http.Get(url)
	checkErr(err)
	checkCode(res)
	defer res.Body.Close()

	doc, err := goquery.NewDocumentFromReader(res.Body)
	checkErr(err)

	linkCount := 0
	doc.Find(".pagination").Each(func(_ int, pagination *goquery.Selection) {
		// If several elements match, the count of the last one wins.
		linkCount = pagination.Find("a").Length()
	})
	return linkCount
}
// checkErr does nothing for a nil err; otherwise it logs err and terminates
// the program through log.Fatalln.
func checkErr(err error) {
	if err == nil {
		return
	}
	log.Fatalln(err)
}
// checkCode returns normally when res has status code 200; for any other
// status it logs the code and status text and terminates the program through
// log.Fatalln.
func checkCode(res *http.Response) {
	if res.StatusCode == http.StatusOK {
		return
	}
	log.Fatalln("Request failed with Status:", res.StatusCode, res.Status)
}
서버를 실행하고 http://localhost:1323/ 에 접속하면 아래와 같다. 검색어를 입력하고 검색버튼을 클릭한다.
검색 결과가 웹브라우저 다운로드 폴더에 자동으로(컴퓨터 설정에 따라서는 수동으로) 저장된다.
참고자료 [https://youtu.be/MRrx8Lk1wOI]
'GO lang' 카테고리의 다른 글
[GO] Decorator - 심화(log pattern) (0) | 2021.11.12 |
---|---|
[GO] Decorator 패턴 (0) | 2021.11.06 |
[GO] Scrapping(1) - URL checker (0) | 2021.10.07 |
[GO] channel (0) | 2021.10.07 |
[GO] 고루틴 (0) | 2021.10.07 |