From 7a642d53f94dfad3faedf5ca48005d946793db11 Mon Sep 17 00:00:00 2001 From: hashir omer Date: Wed, 21 Sep 2022 09:31:50 +0500 Subject: [PATCH] Refactored code --- .gitignore | 4 +- go.mod | 5 +- main.go | 165 +------------------------------------- network/httpClient.go | 46 +++++++++++ upwork/upworkClient.go | 57 ++++++++++++++ upwork/upworkPipeline.go | 166 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 278 insertions(+), 165 deletions(-) create mode 100644 network/httpClient.go create mode 100644 upwork/upworkClient.go create mode 100644 upwork/upworkPipeline.go diff --git a/.gitignore b/.gitignore index 9ac7433..7cfcc6b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ data/ -all_jobs.json \ No newline at end of file +all_jobs.json + +*.json \ No newline at end of file diff --git a/go.mod b/go.mod index 7e51255..da264c3 100644 --- a/go.mod +++ b/go.mod @@ -1,9 +1,10 @@ -module github.com/USERNAME/simple-go-service +module scrapers go 1.18 +require github.com/Danny-Dasilva/CycleTLS/cycletls v0.0.0-20220620102923-c84d740b4757 + require ( - github.com/Danny-Dasilva/CycleTLS/cycletls v0.0.0-20220620102923-c84d740b4757 github.com/Danny-Dasilva/fhttp v0.0.0-20220524230104-f801520157d6 // indirect github.com/Danny-Dasilva/utls v0.0.0-20220604023528-30cb107b834e // indirect github.com/andybalholm/brotli v1.0.4 // indirect diff --git a/main.go b/main.go index c14ba9d..4e02a42 100644 --- a/main.go +++ b/main.go @@ -1,174 +1,15 @@ package main import ( - "bufio" - "encoding/json" - "fmt" - "os" - "path/filepath" - "time" - - "github.com/Danny-Dasilva/CycleTLS/cycletls" + "scrapers/upwork" ) -func get_data(url string, headers map[string]string) (string, error) { - client := cycletls.Init() - response, err := client.Do( - url, - cycletls.Options{ - Body: "", - Ja3: "771,4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-51-57-47-53-10,0-23-65281-10-11-35-16-5-51-43-13-45-28-21,29-23-24-25-256-257,0", - UserAgent: "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0", - Headers: headers, - }, - "GET", - ) - - if err == nil { - return string(response.Body), nil - } - return response.Body, nil - -} - -func save_to_file(data string, filename string) error { - file, err := os.Create(filename) - if err != nil { - return err - } - defer file.Close() - - writer := bufio.NewWriter(file) - _, err = writer.WriteString(data) - if err != nil { - return err - } - - writer.Flush() - return nil -} - -// func check_captcha(data map[string]string) bool { -// if _, ok := data["blockScript"]; ok { -// return true -// } -// return false - -// } - func main() { - os.RemoveAll("data") - - os.Mkdir("data", 0755) - - headers := map[string]string{ - "authority": "www.upwork.com", - "accept": "application/json, text/plain", - "accept-language": "en", - "cache-control": "no-cache", - "pragma": "no-cache", - "referer": "https://www.upwork.com/search/jobs/url?per_page=10&sort=recency", - "sec-fetch-site": "same-origin", - "sec-gpc": "1", - "vnd-eo-parent-span-id": "2724011d-2430-47f5-b5b9-603f2e919685", - "vnd-eo-span-id": "9d6e5b36-ace2-402e-a188-01da1d6b84ee", - "x-odesk-user-agent": "oDesk LM", - "x-requested-with": "XMLHttpRequest", - } - //Upwork limits pagination to 100 pages - total_iterations := 10 - //Query to serach for on Upwork, searching for jobs with shopify keyword - query := "shopify" - //Number of results per page - per_page := 100 - - start := 1 - for i := start; i <= total_iterations; i++ { - fmt.Print("Iteration: ", i, "\n") - upwork_api_url_template := "https://www.upwork.com/search/jobs/url?q=%s&per_page=%d&sort=recency&page=%d" - url := fmt.Sprintf(upwork_api_url_template, query, per_page, i) - - time.Sleep(2 * time.Second) - - data, err := get_data(url, headers) - if err != nil { - fmt.Println(err) - fmt.Println(i) - panic(err) - - } - - //Get body of the response - filename := fmt.Sprintf("data/%d.json", i) - err = save_to_file(data, filename) - if err != nil { - fmt.Println(err) - panic(err) - } - } - - fmt.Println("Scraping done") - - files, err := filepath.Glob("data/*.json") + p := upwork.InitPipeline() + err := p.Run("Shopify") if err != nil { panic(err) } - var all_jobs []map[string]interface{} - for _, file := range files { - fmt.Println(file) - data, err := os.ReadFile(file) - if err != nil { - panic(err) - } - - //Parse data as json without interface - var result map[string]interface{} - err = json.Unmarshal(data, &result) - if err != nil { - panic(err) - } - - //Get value from key - key := "searchResults" - value := result[key] - - //Check for errors - is_error := value.(map[string]interface{})["jobSearchError"] - - //Skip the file if is_error is True - - if is_error == true { - fmt.Println("Error") - continue - } - - //Get jobs from the json - jobs := value.(map[string]interface{})["jobs"] - - //Add all jobs to the all_jobs slice - for _, job := range jobs.([]interface{}) { - all_jobs = append(all_jobs, job.(map[string]interface{})) - } - - } - - jobs := map[string]interface{}{ - "jobs": all_jobs, - } - - json_data, err := json.Marshal(jobs) - - if err != nil { - panic(err) - } - - err = save_to_file(string(json_data), "all_jobs.json") - if err != nil { - panic(err) - } - - os.RemoveAll("data") - } diff --git a/network/httpClient.go b/network/httpClient.go new file mode 100644 index 0000000..ee6b562 --- /dev/null +++ b/network/httpClient.go @@ -0,0 +1,46 @@ +package network + +import "github.com/Danny-Dasilva/CycleTLS/cycletls" + +type Client struct { + httpClient cycletls.CycleTLS + options cycletls.Options +} + +func InitClient(headers map[string]string) *Client { + + client := cycletls.Init() + options := cycletls.Options{ + Body: "", + Ja3: "771,4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-51-57-47-53-10,0-23-65281-10-11-35-16-5-51-43-13-45-28-21,29-23-24-25-256-257,0", + UserAgent: "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0", + Headers: headers, + } + + httpClient := Client{ + httpClient: client, + options: options, + } + + return &httpClient + +} + +func (c Client) GetRequest(url string) (cycletls.Response, error) { + var res cycletls.Response + response, error := c.httpClient.Do(url, c.options, "GET") + if error != nil { + return res, error + } + return response, nil +} + +func (c Client) PostRequest(url string, body string) (cycletls.Response, error) { + var res cycletls.Response + c.options.Body = body + response, error := c.httpClient.Do(url, c.options, "POST") + if error != nil { + return res, error + } + return response, nil +} diff --git a/upwork/upworkClient.go b/upwork/upworkClient.go new file mode 100644 index 0000000..cd42f13 --- /dev/null +++ b/upwork/upworkClient.go @@ -0,0 +1,57 @@ +package upwork + +import ( + "fmt" + "scrapers/network" +) + +type Upwork struct { + UpworkHttpClient *network.Client +} + +type UrlArgs struct { + Page int + Per_Page int + Query string +} + +func (u Upwork) ConstructUrl(args UrlArgs) string { + url := "https://www.upwork.com/search/jobs/url?q=%s&per_page=%d&sort=recency&page=%d" + + return fmt.Sprintf(url, args.Query, args.Per_Page, args.Page) +} + +func (u Upwork) SendRequest(url string) (string, error) { + Upclient := u.UpworkHttpClient + resp, err := Upclient.GetRequest(url) + + if err != nil { + return "", err + } + return resp.Body, nil + +} + +func InitUpwork() *Upwork { + headers := map[string]string{ + "authority": "www.upwork.com", + "accept": "application/json, text/plain", + "accept-language": "en", + "cache-control": "no-cache", + "pragma": "no-cache", + "referer": "https://www.upwork.com/search/jobs/url?per_page=10&sort=recency", + "sec-fetch-site": "same-origin", + "sec-gpc": "1", + "vnd-eo-parent-span-id": "2724011d-2430-47f5-b5b9-603f2e919685", + "vnd-eo-span-id": "9d6e5b36-ace2-402e-a188-01da1d6b84ee", + "x-odesk-user-agent": "oDesk LM", + "x-requested-with": "XMLHttpRequest", + } + client := network.InitClient(headers) + upwork := Upwork{ + UpworkHttpClient: client, + } + + return &upwork + +} diff --git a/upwork/upworkPipeline.go b/upwork/upworkPipeline.go new file mode 100644 index 0000000..ed264a6 --- /dev/null +++ b/upwork/upworkPipeline.go @@ -0,0 +1,166 @@ +package upwork + +import ( + "encoding/json" + "fmt" + "log" + "os" + "path/filepath" +) + +type UpworkPipeLine struct { + upworkClient *Upwork + iterations int +} + +func InitPipeline() *UpworkPipeLine { + u := UpworkPipeLine{ + upworkClient: InitUpwork(), + iterations: 5, + } + return &u +} + +func (u *UpworkPipeLine) CombineFiles() error { + + var all_jobs []interface{} + all_filenames, err := filepath.Glob("data/*.json") + if err != nil { + panic(err) + } + + for _, file := range all_filenames { + data, err := os.ReadFile(file) + if err != nil { + panic(err) + } + var result map[string]interface{} + + //Parse data as json + err = json.Unmarshal([]byte(data), &result) + if err != nil { + panic(err) + } + + key := "searchResults" + value := result[key] + jobs := value.(map[string]interface{})["jobs"].([]interface{}) + for _, job := range jobs { + all_jobs = append(all_jobs, job.(map[string]interface{})) + } + + //save to file + filename := "all_jobs.json" + + //Convert to json + json_data, err := json.Marshal(all_jobs) + if err != nil { + panic(err) + } + err = u.saveToFile(json_data, filename) + if err != nil { + panic(err) + } + + } + return nil +} + +func (u *UpworkPipeLine) saveToFile(data []byte, filename string) error { + + err := os.WriteFile(filename, data, 0644) + if err != nil { + return err + } + return nil + +} + +func isJSON(s string) bool { + var js map[string]interface{} + return json.Unmarshal([]byte(s), &js) == nil + +} + +func isApiError(data string) bool { + var result map[string]interface{} + + //Parse data as json + err := json.Unmarshal([]byte(data), &result) + if err != nil { + panic(err) + } + + //Get value from key + key := "searchResults" + value := result[key] + + //Check for errors + is_error := value.(map[string]interface{})["jobSearchError"] + + return is_error == true +} + +func (u *UpworkPipeLine) validateResponse(data string) bool { + return isJSON(data) && !isApiError(data) + +} + +func (u *UpworkPipeLine) handleRequest(urlArgs UrlArgs, iteration int) { + client := u.upworkClient + url := client.ConstructUrl(urlArgs) + resp, err := u.upworkClient.SendRequest(url) + if err != nil { + log.Fatal(err) + + //check if response is valid + } else if u.validateResponse(resp) { + filename := fmt.Sprintf("data/%d.json", iteration) + // Convert resp to array of bytes + err = u.saveToFile([]byte(resp), filename) + if err != nil { + log.Println(err) + } + } else { + log.Println("Invalid response returned") + + } + +} + +func (u *UpworkPipeLine) Run(query string) error { + + err := os.RemoveAll("data") + if err != nil { + return err + } + err = os.Mkdir("data", 0755) + if err != nil { + return err + } + + var iteration int + for iteration = 1; iteration <= u.iterations; iteration++ { + log.Println("Iteration: ", iteration) + urlArgs := UrlArgs{ + Page: iteration, + Per_Page: 100, + Query: "Shopify", + } + //It is possible to use a go routine here but be nice to the api or you will be rate limited pretty quickly. It is technically possible to circumvent it using a proxy but it is not recommended. + // go u.handleRequest(urlArgs, i) + u.handleRequest(urlArgs, iteration) + + } + + err = u.CombineFiles() + os.RemoveAll("data") + + if err != nil { + panic(err) + + } + + return nil + +}