Refactored code

This commit is contained in:
hashir omer
2022-09-21 09:31:50 +05:00
parent a12b015d39
commit 7a642d53f9
6 changed files with 278 additions and 165 deletions

4
.gitignore vendored
View File

@@ -1,4 +1,6 @@
data/
all_jobs.json
all_jobs.json
*.json

5
go.mod
View File

@@ -1,9 +1,10 @@
module github.com/USERNAME/simple-go-service
module scrapers
go 1.18
require github.com/Danny-Dasilva/CycleTLS/cycletls v0.0.0-20220620102923-c84d740b4757
require (
github.com/Danny-Dasilva/CycleTLS/cycletls v0.0.0-20220620102923-c84d740b4757
github.com/Danny-Dasilva/fhttp v0.0.0-20220524230104-f801520157d6 // indirect
github.com/Danny-Dasilva/utls v0.0.0-20220604023528-30cb107b834e // indirect
github.com/andybalholm/brotli v1.0.4 // indirect

165
main.go
View File

@@ -1,174 +1,15 @@
package main
import (
"bufio"
"encoding/json"
"fmt"
"os"
"path/filepath"
"time"
"github.com/Danny-Dasilva/CycleTLS/cycletls"
"scrapers/upwork"
)
// get_data performs a GET request against url with the given headers, using
// a fixed JA3 fingerprint and Firefox user-agent so the request passes
// Upwork's TLS fingerprinting. It returns the response body as a string.
//
// BUG FIX: the original inverted the error check — on success it returned the
// body, but on failure it ALSO returned a nil error with whatever partial
// response was available, so callers could never see a transport failure.
// Errors are now propagated.
func get_data(url string, headers map[string]string) (string, error) {
	client := cycletls.Init()
	response, err := client.Do(
		url,
		cycletls.Options{
			Body:      "",
			Ja3:       "771,4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-51-57-47-53-10,0-23-65281-10-11-35-16-5-51-43-13-45-28-21,29-23-24-25-256-257,0",
			UserAgent: "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0",
			Headers:   headers,
		},
		"GET",
	)
	if err != nil {
		return "", err
	}
	return response.Body, nil
}
// save_to_file writes data to filename, creating the file or truncating it
// if it already exists. It returns any error from creating or writing.
//
// BUG FIX: the original called writer.Flush() but discarded its error, so a
// failed flush (e.g. disk full) was reported as success. The Flush error is
// now returned.
func save_to_file(data string, filename string) error {
	file, err := os.Create(filename)
	if err != nil {
		return err
	}
	defer file.Close()
	writer := bufio.NewWriter(file)
	if _, err := writer.WriteString(data); err != nil {
		return err
	}
	return writer.Flush()
}
// func check_captcha(data map[string]string) bool {
// if _, ok := data["blockScript"]; ok {
// return true
// }
// return false
// }
// main scrapes Upwork's job-search endpoint for the "shopify" query, saving
// each page of raw JSON under data/, then merges every page's jobs into a
// single all_jobs.json and removes the scratch directory.
//
// NOTE(review): this block looks like a merged diff of the OLD and NEW
// main.go — it both runs the inline scraping loop and calls
// upwork.InitPipeline, and `err` is declared twice in the same scope
// (filepath.Glob at the top of the aggregation section, then `err :=
// p.Run(...)`), so it will not compile as shown. Reconcile against the real
// file before relying on it.
func main() {
// Start from a clean scratch directory for the per-page JSON dumps.
os.RemoveAll("data")
os.Mkdir("data", 0755)
// Browser-like request headers; combined with the JA3 fingerprint set in
// get_data these make the request resemble normal browser traffic.
headers := map[string]string{
"authority": "www.upwork.com",
"accept": "application/json, text/plain",
"accept-language": "en",
"cache-control": "no-cache",
"pragma": "no-cache",
"referer": "https://www.upwork.com/search/jobs/url?per_page=10&sort=recency",
"sec-fetch-site": "same-origin",
"sec-gpc": "1",
"vnd-eo-parent-span-id": "2724011d-2430-47f5-b5b9-603f2e919685",
"vnd-eo-span-id": "9d6e5b36-ace2-402e-a188-01da1d6b84ee",
"x-odesk-user-agent": "oDesk LM",
"x-requested-with": "XMLHttpRequest",
}
//Upwork limits pagination to 100 pages
total_iterations := 10
//Query to search for on Upwork, searching for jobs with shopify keyword
query := "shopify"
//Number of results per page
per_page := 100
start := 1
// Fetch pages start..total_iterations, saving each response to data/<i>.json.
for i := start; i <= total_iterations; i++ {
fmt.Print("Iteration: ", i, "\n")
upwork_api_url_template := "https://www.upwork.com/search/jobs/url?q=%s&per_page=%d&sort=recency&page=%d"
url := fmt.Sprintf(upwork_api_url_template, query, per_page, i)
// Throttle to avoid rate limiting.
time.Sleep(2 * time.Second)
data, err := get_data(url, headers)
if err != nil {
fmt.Println(err)
fmt.Println(i)
panic(err)
}
//Get body of the response
filename := fmt.Sprintf("data/%d.json", i)
err = save_to_file(data, filename)
if err != nil {
fmt.Println(err)
panic(err)
}
}
fmt.Println("Scraping done")
// NOTE(review): from here the old aggregation code and the new pipeline
// call are interleaved (diff artifact); `err` is redeclared below.
files, err := filepath.Glob("data/*.json")
p := upwork.InitPipeline()
err := p.Run("Shopify")
if err != nil {
panic(err)
}
// Aggregate the jobs arrays from every saved page into one slice.
var all_jobs []map[string]interface{}
for _, file := range files {
fmt.Println(file)
data, err := os.ReadFile(file)
if err != nil {
panic(err)
}
//Parse data as json without interface
var result map[string]interface{}
err = json.Unmarshal(data, &result)
if err != nil {
panic(err)
}
//Get value from key
key := "searchResults"
value := result[key]
//Check for errors
is_error := value.(map[string]interface{})["jobSearchError"]
//Skip the file if is_error is True
if is_error == true {
fmt.Println("Error")
continue
}
//Get jobs from the json
jobs := value.(map[string]interface{})["jobs"]
//Add all jobs to the all_jobs slice
for _, job := range jobs.([]interface{}) {
all_jobs = append(all_jobs, job.(map[string]interface{}))
}
}
// Wrap the combined list in a {"jobs": [...]} object and write it out.
jobs := map[string]interface{}{
"jobs": all_jobs,
}
json_data, err := json.Marshal(jobs)
if err != nil {
panic(err)
}
err = save_to_file(string(json_data), "all_jobs.json")
if err != nil {
panic(err)
}
// Remove the per-page scratch files now that they are merged.
os.RemoveAll("data")
}

46
network/httpClient.go Normal file
View File

@@ -0,0 +1,46 @@
package network
import "github.com/Danny-Dasilva/CycleTLS/cycletls"
// Client wraps a CycleTLS HTTP client together with the fixed request
// options (JA3 fingerprint, user-agent, headers) applied to every request.
type Client struct {
	// httpClient is the underlying CycleTLS transport.
	httpClient cycletls.CycleTLS
	// options holds the per-request settings shared by GetRequest/PostRequest.
	options cycletls.Options
}
// InitClient builds a Client whose every request carries the given headers,
// a fixed JA3 TLS fingerprint, and a Firefox user-agent string.
func InitClient(headers map[string]string) *Client {
	opts := cycletls.Options{
		Body:      "",
		Ja3:       "771,4865-4867-4866-49195-49199-52393-52392-49196-49200-49162-49161-49171-49172-51-57-47-53-10,0-23-65281-10-11-35-16-5-51-43-13-45-28-21,29-23-24-25-256-257,0",
		UserAgent: "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0",
		Headers:   headers,
	}
	return &Client{
		httpClient: cycletls.Init(),
		options:    opts,
	}
}
// GetRequest issues a GET request to url using the client's stored options.
// On failure it returns a zero-value Response along with the error.
//
// FIX: the original named its error variable `error`, shadowing the builtin
// error type (flagged by vet/staticcheck); renamed to the conventional `err`.
func (c Client) GetRequest(url string) (cycletls.Response, error) {
	response, err := c.httpClient.Do(url, c.options, "GET")
	if err != nil {
		return cycletls.Response{}, err
	}
	return response, nil
}
// PostRequest issues a POST request to url with the given body, using the
// client's stored options. The value receiver means the Body assignment only
// mutates this call's copy of the options, so the shared Client is untouched.
//
// FIX: the original named its error variable `error`, shadowing the builtin
// error type (flagged by vet/staticcheck); renamed to the conventional `err`.
func (c Client) PostRequest(url string, body string) (cycletls.Response, error) {
	c.options.Body = body
	response, err := c.httpClient.Do(url, c.options, "POST")
	if err != nil {
		return cycletls.Response{}, err
	}
	return response, nil
}

57
upwork/upworkClient.go Normal file
View File

@@ -0,0 +1,57 @@
package upwork
import (
"fmt"
"scrapers/network"
)
// Upwork is a thin client for Upwork's job-search endpoint, delegating all
// HTTP traffic to the shared fingerprinted network.Client.
type Upwork struct {
	// UpworkHttpClient performs the actual GET requests.
	UpworkHttpClient *network.Client
}
// UrlArgs holds the query parameters substituted into the job-search URL.
type UrlArgs struct {
	// Page is the 1-based results page to request.
	Page int
	// Per_Page is the number of results per page.
	Per_Page int
	// Query is the search keyword (inserted into the URL verbatim).
	Query string
}
// ConstructUrl renders the job-search URL for the given query, page size,
// and page number, sorted by recency.
func (u Upwork) ConstructUrl(args UrlArgs) string {
	const template = "https://www.upwork.com/search/jobs/url?q=%s&per_page=%d&sort=recency&page=%d"
	return fmt.Sprintf(template, args.Query, args.Per_Page, args.Page)
}
// SendRequest GETs the given URL through the client's HTTP transport and
// returns the raw response body, or "" and the transport error on failure.
func (u Upwork) SendRequest(url string) (string, error) {
	response, err := u.UpworkHttpClient.GetRequest(url)
	if err != nil {
		return "", err
	}
	return response.Body, nil
}
// InitUpwork constructs an Upwork client preconfigured with browser-like
// headers (including Upwork's vnd-eo-* span ids and the oDesk user-agent
// marker) so requests resemble normal browser traffic.
func InitUpwork() *Upwork {
	browserHeaders := map[string]string{
		"authority":             "www.upwork.com",
		"accept":                "application/json, text/plain",
		"accept-language":       "en",
		"cache-control":         "no-cache",
		"pragma":                "no-cache",
		"referer":               "https://www.upwork.com/search/jobs/url?per_page=10&sort=recency",
		"sec-fetch-site":        "same-origin",
		"sec-gpc":               "1",
		"vnd-eo-parent-span-id": "2724011d-2430-47f5-b5b9-603f2e919685",
		"vnd-eo-span-id":        "9d6e5b36-ace2-402e-a188-01da1d6b84ee",
		"x-odesk-user-agent":    "oDesk LM",
		"x-requested-with":      "XMLHttpRequest",
	}
	return &Upwork{
		UpworkHttpClient: network.InitClient(browserHeaders),
	}
}

166
upwork/upworkPipeline.go Normal file
View File

@@ -0,0 +1,166 @@
package upwork
import (
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
)
// UpworkPipeLine orchestrates the full scrape: fetch N pages of search
// results into data/, then combine them into all_jobs.json.
type UpworkPipeLine struct {
	// upworkClient fetches individual search-result pages.
	upworkClient *Upwork
	// iterations is the number of result pages to fetch per run.
	iterations int
}
// InitPipeline returns a pipeline wired to a fresh Upwork client, configured
// to fetch five pages of results.
func InitPipeline() *UpworkPipeLine {
	return &UpworkPipeLine{
		upworkClient: InitUpwork(),
		iterations:   5,
	}
}
// CombineFiles merges the jobs arrays from every data/*.json page into a
// single all_jobs.json file.
//
// FIXES over the original:
//   - errors are returned instead of panicking (the function already had an
//     error return that was never used);
//   - unchecked type assertions on "searchResults"/"jobs" no longer panic on
//     malformed files — they now produce a descriptive error;
//   - all_jobs.json is marshalled and written once after the loop instead of
//     being rewritten on every iteration (same final file, O(n) not O(n²)).
func (u *UpworkPipeLine) CombineFiles() error {
	var all_jobs []interface{}
	all_filenames, err := filepath.Glob("data/*.json")
	if err != nil {
		return err
	}
	for _, file := range all_filenames {
		data, err := os.ReadFile(file)
		if err != nil {
			return err
		}
		//Parse data as json
		var result map[string]interface{}
		if err := json.Unmarshal(data, &result); err != nil {
			return err
		}
		searchResults, ok := result["searchResults"].(map[string]interface{})
		if !ok {
			return fmt.Errorf("combining %s: missing searchResults object", file)
		}
		jobs, ok := searchResults["jobs"].([]interface{})
		if !ok {
			return fmt.Errorf("combining %s: missing jobs array", file)
		}
		all_jobs = append(all_jobs, jobs...)
	}
	//Convert to json and save once, after all pages are merged
	json_data, err := json.Marshal(all_jobs)
	if err != nil {
		return err
	}
	return u.saveToFile(json_data, "all_jobs.json")
}
// saveToFile writes data to filename with 0644 permissions, creating or
// truncating the file, and returns any write error.
func (u *UpworkPipeLine) saveToFile(data []byte, filename string) error {
	return os.WriteFile(filename, data, 0644)
}
// isJSON reports whether s parses as a JSON object (or null); arrays and
// scalars fail because the target is a map.
func isJSON(s string) bool {
	var probe map[string]interface{}
	err := json.Unmarshal([]byte(s), &probe)
	return err == nil
}
// isApiError reports whether an Upwork search response signals an API error,
// i.e. searchResults.jobSearchError is the boolean true.
//
// FIX: the original panicked on malformed JSON and on a missing/non-object
// "searchResults" key (nil type assertion). Both cases are now treated as
// errors (return true) so bad payloads are rejected instead of crashing.
func isApiError(data string) bool {
	var result map[string]interface{}
	if err := json.Unmarshal([]byte(data), &result); err != nil {
		// Unparseable payload: treat as an error response.
		return true
	}
	searchResults, ok := result["searchResults"].(map[string]interface{})
	if !ok {
		// No searchResults object: not a usable response.
		return true
	}
	// Only a literal boolean true counts, matching the original's
	// `is_error == true` comparison against an interface{} value.
	isErr, _ := searchResults["jobSearchError"].(bool)
	return isErr
}
// validateResponse reports whether data is well-formed JSON that does not
// carry an Upwork jobSearchError flag.
func (u *UpworkPipeLine) validateResponse(data string) bool {
	if !isJSON(data) {
		return false
	}
	return !isApiError(data)
}
// handleRequest fetches one page of search results described by urlArgs and,
// if the response is valid, saves it to data/<iteration>.json. Failures are
// logged and the page is skipped.
//
// FIX: the original called log.Fatal on a request error, which os.Exit()s the
// whole process — one transient network failure aborted the run, skipping
// aggregation of pages already fetched and the data/ cleanup. A failed page
// is now logged and skipped instead.
func (u *UpworkPipeLine) handleRequest(urlArgs UrlArgs, iteration int) {
	client := u.upworkClient
	url := client.ConstructUrl(urlArgs)
	resp, err := client.SendRequest(url)
	if err != nil {
		log.Println(err)
		return
	}
	//check if response is valid
	if !u.validateResponse(resp) {
		log.Println("Invalid response returned")
		return
	}
	filename := fmt.Sprintf("data/%d.json", iteration)
	if err := u.saveToFile([]byte(resp), filename); err != nil {
		log.Println(err)
	}
}
// Run executes the full pipeline for the given search query: recreate the
// data/ scratch directory, fetch u.iterations pages of results, combine them
// into all_jobs.json, and remove the scratch directory.
//
// FIXES over the original:
//   - the query parameter was ignored — the URL was hard-coded to "Shopify";
//     it is now actually used;
//   - the CombineFiles error was panicked on despite Run returning error; it
//     is now returned;
//   - the final os.RemoveAll error is no longer silently discarded (logged,
//     since cleanup failure should not mask a successful scrape).
func (u *UpworkPipeLine) Run(query string) error {
	if err := os.RemoveAll("data"); err != nil {
		return err
	}
	if err := os.Mkdir("data", 0755); err != nil {
		return err
	}
	for iteration := 1; iteration <= u.iterations; iteration++ {
		log.Println("Iteration: ", iteration)
		urlArgs := UrlArgs{
			Page:     iteration,
			Per_Page: 100,
			Query:    query,
		}
		//It is possible to use a go routine here but be nice to the api or you will be rate limited pretty quickly. It is technically possible to circumvent it using a proxy but it is not recommended.
		// go u.handleRequest(urlArgs, iteration)
		u.handleRequest(urlArgs, iteration)
	}
	combineErr := u.CombineFiles()
	if err := os.RemoveAll("data"); err != nil {
		log.Println(err)
	}
	return combineErr
}