mirror of
https://github.com/hashiromer/Upwork-Jobs-scraper-.git
synced 2025-12-29 16:16:01 +00:00
Refactored code
This commit is contained in:
57
upwork/upworkClient.go
Normal file
57
upwork/upworkClient.go
Normal file
@@ -0,0 +1,57 @@
|
||||
package upwork
|
||||
|
||||
import (
	"fmt"
	"net/url"

	"scrapers/network"
)
|
||||
|
||||
type Upwork struct {
|
||||
UpworkHttpClient *network.Client
|
||||
}
|
||||
|
||||
// UrlArgs holds the values that ConstructUrl substitutes into the search URL.
type UrlArgs struct {
	// Page is the value of the "page" query parameter.
	Page int
	// Per_Page is the value of the "per_page" query parameter.
	Per_Page int
	// Query is the value of the "q" query parameter.
	Query string
}
|
||||
|
||||
func (u Upwork) ConstructUrl(args UrlArgs) string {
|
||||
url := "https://www.upwork.com/search/jobs/url?q=%s&per_page=%d&sort=recency&page=%d"
|
||||
|
||||
return fmt.Sprintf(url, args.Query, args.Per_Page, args.Page)
|
||||
}
|
||||
|
||||
func (u Upwork) SendRequest(url string) (string, error) {
|
||||
Upclient := u.UpworkHttpClient
|
||||
resp, err := Upclient.GetRequest(url)
|
||||
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return resp.Body, nil
|
||||
|
||||
}
|
||||
|
||||
func InitUpwork() *Upwork {
|
||||
headers := map[string]string{
|
||||
"authority": "www.upwork.com",
|
||||
"accept": "application/json, text/plain",
|
||||
"accept-language": "en",
|
||||
"cache-control": "no-cache",
|
||||
"pragma": "no-cache",
|
||||
"referer": "https://www.upwork.com/search/jobs/url?per_page=10&sort=recency",
|
||||
"sec-fetch-site": "same-origin",
|
||||
"sec-gpc": "1",
|
||||
"vnd-eo-parent-span-id": "2724011d-2430-47f5-b5b9-603f2e919685",
|
||||
"vnd-eo-span-id": "9d6e5b36-ace2-402e-a188-01da1d6b84ee",
|
||||
"x-odesk-user-agent": "oDesk LM",
|
||||
"x-requested-with": "XMLHttpRequest",
|
||||
}
|
||||
client := network.InitClient(headers)
|
||||
upwork := Upwork{
|
||||
UpworkHttpClient: client,
|
||||
}
|
||||
|
||||
return &upwork
|
||||
|
||||
}
|
||||
166
upwork/upworkPipeline.go
Normal file
166
upwork/upworkPipeline.go
Normal file
@@ -0,0 +1,166 @@
|
||||
package upwork
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
type UpworkPipeLine struct {
|
||||
upworkClient *Upwork
|
||||
iterations int
|
||||
}
|
||||
|
||||
func InitPipeline() *UpworkPipeLine {
|
||||
u := UpworkPipeLine{
|
||||
upworkClient: InitUpwork(),
|
||||
iterations: 5,
|
||||
}
|
||||
return &u
|
||||
}
|
||||
|
||||
func (u *UpworkPipeLine) CombineFiles() error {
|
||||
|
||||
var all_jobs []interface{}
|
||||
all_filenames, err := filepath.Glob("data/*.json")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
for _, file := range all_filenames {
|
||||
data, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
var result map[string]interface{}
|
||||
|
||||
//Parse data as json
|
||||
err = json.Unmarshal([]byte(data), &result)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
key := "searchResults"
|
||||
value := result[key]
|
||||
jobs := value.(map[string]interface{})["jobs"].([]interface{})
|
||||
for _, job := range jobs {
|
||||
all_jobs = append(all_jobs, job.(map[string]interface{}))
|
||||
}
|
||||
|
||||
//save to file
|
||||
filename := "all_jobs.json"
|
||||
|
||||
//Convert to json
|
||||
json_data, err := json.Marshal(all_jobs)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
err = u.saveToFile(json_data, filename)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (u *UpworkPipeLine) saveToFile(data []byte, filename string) error {
|
||||
|
||||
err := os.WriteFile(filename, data, 0644)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
|
||||
}
|
||||
|
||||
// isJSON reports whether s can be decoded into a map[string]interface{}.
// Valid JSON that is not an object (for example an array or a number) is
// therefore reported as false.
func isJSON(s string) bool {
	var decoded map[string]interface{}
	if err := json.Unmarshal([]byte(s), &decoded); err != nil {
		return false
	}
	return true
}
|
||||
|
||||
// isApiError reports whether data is a response that must not be treated as
// a successful search result.
//
// It returns true when searchResults.jobSearchError is the boolean true. It
// also returns true, instead of panicking, when data is not a JSON object or
// has no "searchResults" object, since such a payload carries no usable
// results. It returns false otherwise, including when jobSearchError is
// absent or is not a boolean.
func isApiError(data string) bool {
	var payload map[string]interface{}
	if err := json.Unmarshal([]byte(data), &payload); err != nil {
		return true
	}

	results, ok := payload["searchResults"].(map[string]interface{})
	if !ok {
		return true
	}

	// Only a literal boolean true counts as an error flag.
	failed, _ := results["jobSearchError"].(bool)
	return failed
}
|
||||
|
||||
func (u *UpworkPipeLine) validateResponse(data string) bool {
|
||||
return isJSON(data) && !isApiError(data)
|
||||
|
||||
}
|
||||
|
||||
func (u *UpworkPipeLine) handleRequest(urlArgs UrlArgs, iteration int) {
|
||||
client := u.upworkClient
|
||||
url := client.ConstructUrl(urlArgs)
|
||||
resp, err := u.upworkClient.SendRequest(url)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
|
||||
//check if response is valid
|
||||
} else if u.validateResponse(resp) {
|
||||
filename := fmt.Sprintf("data/%d.json", iteration)
|
||||
// Convert resp to array of bytes
|
||||
err = u.saveToFile([]byte(resp), filename)
|
||||
if err != nil {
|
||||
log.Println(err)
|
||||
}
|
||||
} else {
|
||||
log.Println("Invalid response returned")
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func (u *UpworkPipeLine) Run(query string) error {
|
||||
|
||||
err := os.RemoveAll("data")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Mkdir("data", 0755)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var iteration int
|
||||
for iteration = 1; iteration <= u.iterations; iteration++ {
|
||||
log.Println("Iteration: ", iteration)
|
||||
urlArgs := UrlArgs{
|
||||
Page: iteration,
|
||||
Per_Page: 100,
|
||||
Query: "Shopify",
|
||||
}
|
||||
//It is possible to use a go routine here but be nice to the api or you will be rate limited pretty quickly. It is technically possible to circumvent it using a proxy but it is not recommended.
|
||||
// go u.handleRequest(urlArgs, i)
|
||||
u.handleRequest(urlArgs, iteration)
|
||||
|
||||
}
|
||||
|
||||
err = u.CombineFiles()
|
||||
os.RemoveAll("data")
|
||||
|
||||
if err != nil {
|
||||
panic(err)
|
||||
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user